diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..91562a2718627f56cb3f88093dd26c3a98c35384 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..9f7a3362b3a201900d77c58b1bf1e7bd976e84bc --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.4513651877133106 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..8b8ed9ad013a967e88d7d6b67f4928b2b34d13a9 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1577576 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-arc_c-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2", + "seed": 42, + "timestamp": "2025-08-30T16:35:26.588040" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..d84de5c5c92523d3ea0b8bf9343b91b839542159 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.48738048, + "gpu_mem": 4.423850496, + "loss": 4.4614, + "grad_norm": 329.5343017578125, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.492885504, + "gpu_mem": 4.436614144, + "loss": 4.6994, + "grad_norm": 335.7124328613281, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.492885504, + "gpu_mem": 4.436644864, + "loss": 2.1292, + "grad_norm": 166.02584838867188, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436611072, + "loss": 1.5628, + "grad_norm": 19.919021606445312, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436598784, + "loss": 1.4114, + "grad_norm": 11.660603523254395, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.493082112, + "gpu_mem": 4.43666176, + "loss": 1.4434, + "grad_norm": 21.82590675354004, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436667904, + "loss": 1.5455, + "grad_norm": 18.506698608398438, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436626432, + "loss": 1.3713, + "grad_norm": 6.629955768585205, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436621824, + "loss": 1.3708, + "grad_norm": 20.72789192199707, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436611072, + "loss": 1.4826, + "grad_norm": 16.597583770751953, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436621824, + "loss": 1.3793, + "grad_norm": 8.454121589660645, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.4366464, + "loss": 1.4662, + "grad_norm": 15.033178329467773, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.493082112, + "gpu_mem": 4.4366464, + "loss": 1.3175, + "grad_norm": 9.481575965881348, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436594176, + "loss": 1.4611, + "grad_norm": 11.892037391662598, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.493082112, + "gpu_mem": 4.43666944, + "loss": 1.3686, + "grad_norm": 5.933671474456787, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436663296, + "loss": 1.6435, + "grad_norm": 23.32131576538086, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436667904, + "loss": 1.6087, + "grad_norm": 17.27324676513672, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443011584, + "loss": 2.1751, + "grad_norm": 17.91927719116211, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443010048, + "loss": 1.4158, + "grad_norm": 5.83022928237915, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.493082112, + "gpu_mem": 4.442985472, + "loss": 1.3469, + "grad_norm": 6.347321510314941, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.493082112, + "gpu_mem": 4.442993152, + "loss": 1.3926, + "grad_norm": 6.69634485244751, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443022336, + "loss": 1.4088, + "grad_norm": 9.436979293823242, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.493082112, + "gpu_mem": 4.44305152, + "loss": 1.3046, + "grad_norm": 3.0805585384368896, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.442994688, + "loss": 1.3668, + "grad_norm": 5.27305269241333, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443063808, + "loss": 1.3333, + "grad_norm": 5.636720180511475, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.493082112, + "gpu_mem": 4.4430208, + "loss": 1.3928, + "grad_norm": 6.520215034484863, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.493082112, + "gpu_mem": 4.442979328, + "loss": 1.4101, + "grad_norm": 8.386139869689941, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443025408, + "loss": 1.5088, + "grad_norm": 14.999929428100586, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.4430208, + "loss": 1.3617, + "grad_norm": 5.525674343109131, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443010048, + "loss": 1.3591, + "grad_norm": 5.05485200881958, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443040768, + "loss": 1.3461, + "grad_norm": 4.573202133178711, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443049984, + "loss": 1.3656, + "grad_norm": 5.207383632659912, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443030016, + "loss": 1.3748, + "grad_norm": 5.258096694946289, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443008512, + "loss": 1.3884, + "grad_norm": 5.586267471313477, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.493082112, + "gpu_mem": 4.442896384, + "loss": 2.0668, + "grad_norm": 7.811915874481201, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436640256, + "loss": 1.3123, + "grad_norm": 3.8615543842315674, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436649472, + "loss": 1.3892, + "grad_norm": 8.06524658203125, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436620288, + "loss": 1.336, + "grad_norm": 4.97226095199585, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.493082112, + "gpu_mem": 4.43663872, + "loss": 1.3288, + "grad_norm": 4.268495559692383, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.493082112, + "gpu_mem": 4.43661568, + "loss": 1.3649, + "grad_norm": 5.184421062469482, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436617216, + "loss": 1.3795, + "grad_norm": 5.0870513916015625, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.493082112, + "gpu_mem": 4.4366464, + "loss": 1.3246, + "grad_norm": 6.057374000549316, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.493082112, + "gpu_mem": 4.43666176, + "loss": 1.3699, + "grad_norm": 8.358153343200684, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436680192, + "loss": 1.3074, + "grad_norm": 4.818901062011719, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436634112, + "loss": 1.2994, + "grad_norm": 5.064252853393555, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436627968, + "loss": 1.2786, + "grad_norm": 4.9510273933410645, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436621824, + "loss": 1.2801, + "grad_norm": 4.4370856285095215, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436626432, + "loss": 1.2029, + "grad_norm": 4.3672943115234375, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436617216, + "loss": 1.2484, + "grad_norm": 3.859243154525757, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436598784, + "loss": 1.3265, + "grad_norm": 7.188536643981934, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.43662336, + "loss": 1.2704, + "grad_norm": 5.372682571411133, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.493082112, + "gpu_mem": 4.436651008, + "loss": 1.3735, + "grad_norm": 8.921072959899902, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443006976, + "loss": 1.832, + "grad_norm": 8.732340812683105, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.493082112, + "gpu_mem": 4.442976256, + "loss": 1.3082, + "grad_norm": 7.223543167114258, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443010048, + "loss": 1.3045, + "grad_norm": 8.533186912536621, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443083776, + "loss": 1.2201, + "grad_norm": 5.6041579246521, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443026944, + "loss": 1.2386, + "grad_norm": 5.491722583770752, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.4430208, + "loss": 1.142, + "grad_norm": 5.923037052154541, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443071488, + "loss": 1.2387, + "grad_norm": 4.976379871368408, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.493082112, + "gpu_mem": 4.44299776, + "loss": 1.2988, + "grad_norm": 7.554241180419922, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443011584, + "loss": 1.2697, + "grad_norm": 6.907799243927002, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.493082112, + "gpu_mem": 4.44301312, + "loss": 1.2166, + "grad_norm": 5.639773845672607, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443002368, + "loss": 1.2443, + "grad_norm": 6.010934829711914, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443019264, + "loss": 1.2085, + "grad_norm": 6.853599548339844, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443040768, + "loss": 1.2106, + "grad_norm": 6.083324909210205, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443031552, + "loss": 1.1881, + "grad_norm": 6.191532611846924, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443057664, + "loss": 1.2379, + "grad_norm": 6.098973751068115, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443008512, + "loss": 1.2206, + "grad_norm": 6.373558044433594, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.493082112, + "gpu_mem": 4.443008512, + "train_runtime": 374.6823, + "train_samples_per_second": 11.946, + "train_steps_per_second": 0.181, + "total_flos": 0.0, + "train_loss": 1.4825304623912363 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0052eed638e4aeb48f103586efb96096bb8d3ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..92b7387f21dc50ef3634874adcc9300e074dc249 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.628839590443686 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..8d052d8c6631c6b25104cf0be243a4cb237b377b --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25389056 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-arc_c-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2", + "seed": 42, + "timestamp": "2025-08-31T06:31:01.002762" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..a5474d3809ee2f96601d5340cc4a6700d92d4730 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.496281088, + "gpu_mem": 4.519020032, + "loss": 4.4614, + "grad_norm": 280.62310791015625, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.501589504, + "gpu_mem": 4.722122752, + "loss": 4.6994, + "grad_norm": 286.9012451171875, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.501786112, + "gpu_mem": 4.722153472, + "loss": 2.1324, + "grad_norm": 415.12750244140625, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.501786112, + "gpu_mem": 4.72211968, + "loss": 1.7543, + "grad_norm": 44.261512756347656, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.501786112, + "gpu_mem": 4.722107392, + "loss": 1.508, + "grad_norm": 22.360448837280273, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.501786112, + "gpu_mem": 4.722170368, + "loss": 1.4382, + "grad_norm": 9.388525009155273, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722176512, + "loss": 1.4429, + "grad_norm": 13.978992462158203, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.50198272, + "gpu_mem": 4.72213504, + "loss": 1.564, + "grad_norm": 15.369060516357422, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722130432, + "loss": 1.5394, + "grad_norm": 17.35812759399414, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.72211968, + "loss": 1.4216, + "grad_norm": 7.401285648345947, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722130432, + "loss": 1.7083, + "grad_norm": 20.328474044799805, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722155008, + "loss": 1.3558, + "grad_norm": 2.454993724822998, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722155008, + "loss": 1.5307, + "grad_norm": 12.548927307128906, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722102784, + "loss": 1.4728, + "grad_norm": 6.753164768218994, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722178048, + "loss": 1.4047, + "grad_norm": 5.13401460647583, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722171904, + "loss": 1.7088, + "grad_norm": 15.40377140045166, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722176512, + "loss": 1.6083, + "grad_norm": 22.233034133911133, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823689728, + "loss": 2.0617, + "grad_norm": 4.666072845458984, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823688192, + "loss": 1.3905, + "grad_norm": 4.700724124908447, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823663616, + "loss": 1.3482, + "grad_norm": 5.293838024139404, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823671296, + "loss": 1.4783, + "grad_norm": 6.684973239898682, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.50198272, + "gpu_mem": 4.82370048, + "loss": 1.3886, + "grad_norm": 4.591466426849365, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823729664, + "loss": 1.3456, + "grad_norm": 3.4432966709136963, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823672832, + "loss": 1.3971, + "grad_norm": 4.504514694213867, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823741952, + "loss": 1.3702, + "grad_norm": 4.352265357971191, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823698944, + "loss": 1.3863, + "grad_norm": 3.940533399581909, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823657472, + "loss": 1.3904, + "grad_norm": 3.843360662460327, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823703552, + "loss": 1.6316, + "grad_norm": 9.981597900390625, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823698944, + "loss": 1.5115, + "grad_norm": 6.392779350280762, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823688192, + "loss": 1.4569, + "grad_norm": 6.152426242828369, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823718912, + "loss": 1.3658, + "grad_norm": 3.233621835708618, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823728128, + "loss": 1.3712, + "grad_norm": 2.499112129211426, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.82370816, + "loss": 1.4401, + "grad_norm": 4.652753829956055, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823686656, + "loss": 1.4205, + "grad_norm": 3.1102354526519775, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823574528, + "loss": 2.0597, + "grad_norm": 1.4106764793395996, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722148864, + "loss": 1.3446, + "grad_norm": 2.1405179500579834, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.50198272, + "gpu_mem": 4.72215808, + "loss": 1.4145, + "grad_norm": 3.9450926780700684, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722128896, + "loss": 1.3416, + "grad_norm": 2.131178617477417, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722147328, + "loss": 1.3367, + "grad_norm": 2.133847713470459, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722124288, + "loss": 1.3975, + "grad_norm": 3.288180112838745, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722125824, + "loss": 1.3958, + "grad_norm": 3.1248035430908203, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722155008, + "loss": 1.335, + "grad_norm": 3.862166404724121, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722170368, + "loss": 1.3372, + "grad_norm": 3.2153024673461914, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.50198272, + "gpu_mem": 4.7221888, + "loss": 1.2856, + "grad_norm": 1.854359745979309, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.50198272, + "gpu_mem": 4.72214272, + "loss": 1.322, + "grad_norm": 2.8490543365478516, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722136576, + "loss": 1.2721, + "grad_norm": 2.5255608558654785, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722130432, + "loss": 1.2753, + "grad_norm": 2.5269887447357178, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.50198272, + "gpu_mem": 4.72213504, + "loss": 1.2814, + "grad_norm": 3.4005813598632812, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722125824, + "loss": 1.2355, + "grad_norm": 2.77209734916687, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722107392, + "loss": 1.2617, + "grad_norm": 2.584846258163452, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722131968, + "loss": 1.2872, + "grad_norm": 3.403454542160034, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.50198272, + "gpu_mem": 4.722159616, + "loss": 1.323, + "grad_norm": 4.359912872314453, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.50198272, + "gpu_mem": 4.82368512, + "loss": 1.8447, + "grad_norm": 5.682162761688232, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.50198272, + "gpu_mem": 4.8236544, + "loss": 1.1505, + "grad_norm": 3.0071282386779785, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823688192, + "loss": 1.151, + "grad_norm": 3.7012956142425537, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.50198272, + "gpu_mem": 4.82376192, + "loss": 1.1417, + "grad_norm": 4.017345905303955, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823705088, + "loss": 1.0821, + "grad_norm": 3.950089454650879, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823698944, + "loss": 1.0089, + "grad_norm": 4.133927345275879, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823749632, + "loss": 1.0377, + "grad_norm": 4.262353420257568, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823675904, + "loss": 1.0539, + "grad_norm": 5.903791904449463, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823689728, + "loss": 1.1056, + "grad_norm": 5.543725490570068, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823691264, + "loss": 1.0513, + "grad_norm": 5.472988128662109, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823680512, + "loss": 1.0188, + "grad_norm": 5.62291955947876, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823697408, + "loss": 1.0059, + "grad_norm": 5.603131294250488, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823718912, + "loss": 1.0754, + "grad_norm": 6.040858268737793, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823709696, + "loss": 0.9901, + "grad_norm": 5.5554585456848145, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823735808, + "loss": 1.0235, + "grad_norm": 6.136469841003418, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823686656, + "loss": 1.0859, + "grad_norm": 5.908904075622559, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.50198272, + "gpu_mem": 4.823686656, + "train_runtime": 378.6352, + "train_samples_per_second": 11.821, + "train_steps_per_second": 0.18, + "total_flos": 0.0, + "train_loss": 1.4682180960388744 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6934cfad94edb068f0d54db83e6a8b58f0fc939 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7f27c71ebc028cb849a9df58fe9a02b99a41ab02 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.378839590443686 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..943a62ba5932bc1de403d5d7ffb39cb86549991c --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6317696 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-arc_c-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T23:32:37.041918" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..9611dbaac813cdaeff3d77154e0deb1aaedb042a --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.489108992, + "gpu_mem": 4.442774016, + "loss": 4.4614, + "grad_norm": 272.1399230957031, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.494614016, + "gpu_mem": 4.493384704, + "loss": 4.6994, + "grad_norm": 279.0349426269531, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.494614016, + "gpu_mem": 4.493415424, + "loss": 2.3086, + "grad_norm": 260.66900634765625, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.494810624, + "gpu_mem": 4.493381632, + "loss": 1.572, + "grad_norm": 18.902830123901367, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493369344, + "loss": 1.5805, + "grad_norm": 30.244815826416016, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49343232, + "loss": 1.3975, + "grad_norm": 10.924633026123047, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493438464, + "loss": 1.4746, + "grad_norm": 35.98440933227539, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493396992, + "loss": 1.5768, + "grad_norm": 21.909250259399414, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493392384, + "loss": 1.3486, + "grad_norm": 8.877981185913086, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493381632, + "loss": 1.6285, + "grad_norm": 21.38736915588379, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493392384, + "loss": 1.4523, + "grad_norm": 7.3010358810424805, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49341696, + "loss": 1.4579, + "grad_norm": 8.112820625305176, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49341696, + "loss": 1.3493, + "grad_norm": 11.578926086425781, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493364736, + "loss": 1.6974, + "grad_norm": 17.703752517700195, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49344, + "loss": 1.5182, + "grad_norm": 8.22641658782959, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493433856, + "loss": 1.4541, + "grad_norm": 7.929551124572754, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493438464, + "loss": 1.3941, + "grad_norm": 5.744842052459717, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518705664, + "loss": 2.1397, + "grad_norm": 13.66163158416748, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518704128, + "loss": 1.4262, + "grad_norm": 6.4307355880737305, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518679552, + "loss": 1.3419, + "grad_norm": 18.498199462890625, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518687232, + "loss": 1.3816, + "grad_norm": 3.349029541015625, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518716416, + "loss": 1.3216, + "grad_norm": 3.663336753845215, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.495007232, + "gpu_mem": 4.5187456, + "loss": 1.3365, + "grad_norm": 6.960829734802246, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518688768, + "loss": 1.3619, + "grad_norm": 5.14816427230835, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518757888, + "loss": 1.3392, + "grad_norm": 4.903714656829834, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.495007232, + "gpu_mem": 4.51871488, + "loss": 1.3861, + "grad_norm": 6.120626926422119, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518673408, + "loss": 1.3876, + "grad_norm": 6.803613662719727, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518719488, + "loss": 1.6675, + "grad_norm": 14.671072006225586, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.51871488, + "loss": 1.4052, + "grad_norm": 6.3027143478393555, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518704128, + "loss": 1.3962, + "grad_norm": 5.853539943695068, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518734848, + "loss": 1.3627, + "grad_norm": 4.129302024841309, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518744064, + "loss": 1.347, + "grad_norm": 6.030110836029053, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518724096, + "loss": 1.4082, + "grad_norm": 8.369693756103516, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518702592, + "loss": 1.4094, + "grad_norm": 14.688669204711914, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518590464, + "loss": 2.1062, + "grad_norm": 14.797432899475098, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493410816, + "loss": 1.3897, + "grad_norm": 10.996891021728516, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493420032, + "loss": 1.4705, + "grad_norm": 15.965860366821289, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493390848, + "loss": 1.3793, + "grad_norm": 5.473352432250977, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49340928, + "loss": 1.3478, + "grad_norm": 2.9749255180358887, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49338624, + "loss": 1.4282, + "grad_norm": 7.062312126159668, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493387776, + "loss": 1.4169, + "grad_norm": 5.1576642990112305, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49341696, + "loss": 1.4134, + "grad_norm": 9.255854606628418, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49343232, + "loss": 1.3524, + "grad_norm": 5.755366802215576, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493450752, + "loss": 1.3488, + "grad_norm": 4.835580348968506, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493404672, + "loss": 1.3469, + "grad_norm": 6.548555374145508, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493398528, + "loss": 1.2932, + "grad_norm": 3.8608975410461426, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493392384, + "loss": 1.3043, + "grad_norm": 4.572495460510254, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493396992, + "loss": 1.2866, + "grad_norm": 3.4438271522521973, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493387776, + "loss": 1.3026, + "grad_norm": 3.738175868988037, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493369344, + "loss": 1.3494, + "grad_norm": 4.942461967468262, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.49339392, + "loss": 1.3157, + "grad_norm": 4.040122985839844, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.495007232, + "gpu_mem": 4.493421568, + "loss": 1.4034, + "grad_norm": 4.740878105163574, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518701056, + "loss": 1.9696, + "grad_norm": 6.076801300048828, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518670336, + "loss": 1.3186, + "grad_norm": 6.321723937988281, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518704128, + "loss": 1.3322, + "grad_norm": 4.518864154815674, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518777856, + "loss": 1.3229, + "grad_norm": 3.9502453804016113, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518721024, + "loss": 1.2922, + "grad_norm": 2.77620530128479, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.51871488, + "loss": 1.221, + "grad_norm": 5.571518898010254, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518765568, + "loss": 1.2745, + "grad_norm": 4.341223239898682, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.495007232, + "gpu_mem": 4.51869184, + "loss": 1.3552, + "grad_norm": 5.9276251792907715, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518705664, + "loss": 1.3201, + "grad_norm": 4.258768558502197, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.495007232, + "gpu_mem": 4.5187072, + "loss": 1.3376, + "grad_norm": 5.822268009185791, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518696448, + "loss": 1.3397, + "grad_norm": 3.860724925994873, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518713344, + "loss": 1.2999, + "grad_norm": 4.712264060974121, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518734848, + "loss": 1.3846, + "grad_norm": 7.827590465545654, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518725632, + "loss": 1.255, + "grad_norm": 4.00440788269043, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518751744, + "loss": 1.2525, + "grad_norm": 4.9576640129089355, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518702592, + "loss": 1.3006, + "grad_norm": 4.395829677581787, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.495007232, + "gpu_mem": 4.518702592, + "train_runtime": 376.1937, + "train_samples_per_second": 11.898, + "train_steps_per_second": 0.181, + "total_flos": 0.0, + "train_loss": 1.523840169696247 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..91562a2718627f56cb3f88093dd26c3a98c35384 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..30cc83ae75888d4bc957956ed4c0c781daafe129 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.33375420875420875 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..9d1497ca3f86f03ccfbc8fff7ae1e5333bfbe172 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1577576 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-arc_e-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2", + "seed": 42, + "timestamp": "2025-08-30T15:57:00.430559" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..48f839dbfdd2a9c0f268296ef72ce8ff59caf3d3 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.48697088, + "gpu_mem": 4.4237952, + "loss": 4.6319, + "grad_norm": 334.8832702636719, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.492672512, + "gpu_mem": 4.436629504, + "loss": 4.4578, + "grad_norm": 338.71502685546875, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.49286912, + "gpu_mem": 4.436608, + "loss": 3.0613, + "grad_norm": 203.26577758789062, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.493065728, + "gpu_mem": 4.436586496, + "loss": 2.1672, + "grad_norm": 93.64673614501953, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.493065728, + "gpu_mem": 4.436627968, + "loss": 1.5508, + "grad_norm": 19.551036834716797, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.493262336, + "gpu_mem": 4.436603392, + "loss": 1.4936, + "grad_norm": 32.31931686401367, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.493262336, + "gpu_mem": 4.436626432, + "loss": 1.4445, + "grad_norm": 20.193700790405273, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.493262336, + "gpu_mem": 4.43658496, + "loss": 1.3912, + "grad_norm": 15.018762588500977, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.493262336, + "gpu_mem": 4.436586496, + "loss": 1.3628, + "grad_norm": 12.806224822998047, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.493262336, + "gpu_mem": 4.436581888, + "loss": 1.6795, + "grad_norm": 60.71196746826172, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436660224, + "loss": 1.3897, + "grad_norm": 14.609763145446777, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436634112, + "loss": 1.3519, + "grad_norm": 10.01632308959961, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.493458944, + "gpu_mem": 4.43658496, + "loss": 1.3813, + "grad_norm": 9.067853927612305, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436606464, + "loss": 1.4137, + "grad_norm": 11.54834270477295, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436583424, + "loss": 1.33, + "grad_norm": 4.984076499938965, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436588032, + "loss": 1.3787, + "grad_norm": 4.959704875946045, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436624896, + "loss": 1.3491, + "grad_norm": 6.165195465087891, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436635648, + "loss": 1.3499, + "grad_norm": 6.145087242126465, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436578816, + "loss": 1.3647, + "grad_norm": 14.54247760772705, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436649472, + "loss": 1.3599, + "grad_norm": 6.127029895782471, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436647936, + "loss": 1.3275, + "grad_norm": 5.688446521759033, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436604928, + "loss": 1.3288, + "grad_norm": 7.160696029663086, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436621824, + "loss": 1.3048, + "grad_norm": 5.542471408843994, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436578816, + "loss": 1.3567, + "grad_norm": 9.522262573242188, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436608, + "loss": 1.3671, + "grad_norm": 9.449694633483887, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436588032, + "loss": 1.4186, + "grad_norm": 7.2168426513671875, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436614144, + "loss": 1.3558, + "grad_norm": 10.885024070739746, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436614144, + "loss": 1.3774, + "grad_norm": 6.4214277267456055, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.493458944, + "gpu_mem": 4.43659264, + "loss": 1.2741, + "grad_norm": 6.542706489562988, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436583424, + "loss": 1.3852, + "grad_norm": 9.458147048950195, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436601856, + "loss": 1.3504, + "grad_norm": 16.29282569885254, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436624896, + "loss": 1.3272, + "grad_norm": 7.9392499923706055, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436621824, + "loss": 1.3411, + "grad_norm": 4.136773109436035, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436624896, + "loss": 1.3751, + "grad_norm": 5.5571393966674805, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436606464, + "loss": 1.2823, + "grad_norm": 3.0069351196289062, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442983936, + "loss": 1.9088, + "grad_norm": 9.872271537780762, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442988544, + "loss": 1.3898, + "grad_norm": 9.080979347229004, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44296704, + "loss": 1.2297, + "grad_norm": 4.792629718780518, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442956288, + "loss": 1.3502, + "grad_norm": 10.400541305541992, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443019264, + "loss": 1.3253, + "grad_norm": 4.320893287658691, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442979328, + "loss": 1.3928, + "grad_norm": 12.37241268157959, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443022336, + "loss": 1.3985, + "grad_norm": 9.03736686706543, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442971648, + "loss": 1.3813, + "grad_norm": 5.811436176300049, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44303616, + "loss": 1.3624, + "grad_norm": 5.308187961578369, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443003904, + "loss": 1.3711, + "grad_norm": 4.097907066345215, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443008512, + "loss": 1.3712, + "grad_norm": 4.213882923126221, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442954752, + "loss": 1.3301, + "grad_norm": 7.293313503265381, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442968576, + "loss": 1.3347, + "grad_norm": 8.195301055908203, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442957824, + "loss": 1.3941, + "grad_norm": 25.371461868286133, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442971648, + "loss": 1.4377, + "grad_norm": 18.559114456176758, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443023872, + "loss": 1.364, + "grad_norm": 8.876504898071289, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442971648, + "loss": 1.686, + "grad_norm": 56.97942352294922, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443040768, + "loss": 1.4077, + "grad_norm": 22.452436447143555, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443008512, + "loss": 1.3135, + "grad_norm": 6.5965576171875, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443017728, + "loss": 1.3823, + "grad_norm": 6.214559078216553, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442993152, + "loss": 1.3107, + "grad_norm": 2.944521903991699, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443026944, + "loss": 1.3232, + "grad_norm": 4.46520471572876, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443008512, + "loss": 1.3273, + "grad_norm": 3.4397385120391846, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442994688, + "loss": 1.3453, + "grad_norm": 3.8322765827178955, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443033088, + "loss": 1.3242, + "grad_norm": 4.89071798324585, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442965504, + "loss": 1.2936, + "grad_norm": 4.631297588348389, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44301312, + "loss": 1.4406, + "grad_norm": 12.51733684539795, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442962432, + "loss": 1.446, + "grad_norm": 15.611908912658691, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443011584, + "loss": 1.4236, + "grad_norm": 12.190735816955566, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443010048, + "loss": 1.3596, + "grad_norm": 5.446914196014404, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44302848, + "loss": 1.2865, + "grad_norm": 4.165730953216553, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442970112, + "loss": 1.2999, + "grad_norm": 3.62854266166687, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.493458944, + "gpu_mem": 4.4429824, + "loss": 1.3443, + "grad_norm": 3.0337541103363037, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443006976, + "loss": 1.3273, + "grad_norm": 4.934184551239014, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442983936, + "loss": 1.2715, + "grad_norm": 3.110337495803833, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442819584, + "loss": 1.9529, + "grad_norm": 7.656370639801025, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436617216, + "loss": 1.3812, + "grad_norm": 5.92775297164917, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436580352, + "loss": 1.3011, + "grad_norm": 7.2833356857299805, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436640256, + "loss": 1.3427, + "grad_norm": 14.072025299072266, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436608, + "loss": 1.3463, + "grad_norm": 7.366079807281494, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436618752, + "loss": 1.3097, + "grad_norm": 11.609695434570312, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436655616, + "loss": 1.3418, + "grad_norm": 11.972086906433105, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436640256, + "loss": 1.3932, + "grad_norm": 11.582221984863281, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436591104, + "loss": 1.2812, + "grad_norm": 7.3847832679748535, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436635648, + "loss": 1.3296, + "grad_norm": 6.362971782684326, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436621824, + "loss": 1.3876, + "grad_norm": 11.030096054077148, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436589568, + "loss": 1.4036, + "grad_norm": 9.106473922729492, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436640256, + "loss": 1.3261, + "grad_norm": 11.944342613220215, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436578816, + "loss": 1.3887, + "grad_norm": 12.192349433898926, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436624896, + "loss": 1.3469, + "grad_norm": 7.058681488037109, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436578816, + "loss": 1.3328, + "grad_norm": 5.5536932945251465, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436609536, + "loss": 1.3939, + "grad_norm": 7.393185138702393, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.493458944, + "gpu_mem": 4.43658496, + "loss": 1.3269, + "grad_norm": 7.776304721832275, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.493458944, + "gpu_mem": 4.43663872, + "loss": 1.3153, + "grad_norm": 5.972353935241699, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436620288, + "loss": 1.3751, + "grad_norm": 6.654686450958252, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.493458944, + "gpu_mem": 4.4365696, + "loss": 1.3689, + "grad_norm": 8.81556224822998, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436594176, + "loss": 1.3643, + "grad_norm": 6.586201190948486, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436597248, + "loss": 1.2994, + "grad_norm": 4.92837381362915, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436589568, + "loss": 1.2857, + "grad_norm": 4.152866840362549, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436627968, + "loss": 1.3404, + "grad_norm": 4.53918981552124, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436637184, + "loss": 1.2617, + "grad_norm": 6.160858631134033, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436580352, + "loss": 1.3543, + "grad_norm": 4.770242691040039, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436580352, + "loss": 1.327, + "grad_norm": 2.966948986053467, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.493458944, + "gpu_mem": 4.43657728, + "loss": 1.3015, + "grad_norm": 3.1356072425842285, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436575744, + "loss": 1.272, + "grad_norm": 5.338186264038086, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436618752, + "loss": 1.2616, + "grad_norm": 4.881860733032227, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436557312, + "loss": 1.3467, + "grad_norm": 5.19181489944458, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436606464, + "loss": 1.309, + "grad_norm": 3.279639959335327, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.493458944, + "gpu_mem": 4.43666944, + "loss": 1.3868, + "grad_norm": 6.660994052886963, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436621824, + "loss": 1.2845, + "grad_norm": 3.30313777923584, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.493458944, + "gpu_mem": 4.436603392, + "loss": 1.3232, + "grad_norm": 3.6408579349517822, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443006976, + "loss": 1.8794, + "grad_norm": 7.724911212921143, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442988544, + "loss": 1.3102, + "grad_norm": 3.513817310333252, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442977792, + "loss": 1.3073, + "grad_norm": 2.583456516265869, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443031552, + "loss": 1.327, + "grad_norm": 5.944295406341553, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442991616, + "loss": 1.3193, + "grad_norm": 3.914555072784424, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443010048, + "loss": 1.2872, + "grad_norm": 2.4625840187072754, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443073024, + "loss": 1.3187, + "grad_norm": 3.991567611694336, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443000832, + "loss": 1.3259, + "grad_norm": 3.1732451915740967, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442994688, + "loss": 1.3577, + "grad_norm": 4.750394821166992, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443010048, + "loss": 1.3211, + "grad_norm": 3.380751132965088, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443025408, + "loss": 1.29, + "grad_norm": 4.058185577392578, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443016192, + "loss": 1.2777, + "grad_norm": 5.971620082855225, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443006976, + "loss": 1.2883, + "grad_norm": 3.4085781574249268, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443025408, + "loss": 1.2777, + "grad_norm": 2.7961478233337402, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443023872, + "loss": 1.2971, + "grad_norm": 4.205790042877197, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442980864, + "loss": 1.2862, + "grad_norm": 4.291749000549316, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44301312, + "loss": 1.2822, + "grad_norm": 4.5682053565979, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44296704, + "loss": 1.3005, + "grad_norm": 2.844740867614746, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443011584, + "loss": 1.2295, + "grad_norm": 5.023967266082764, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442962432, + "loss": 1.2662, + "grad_norm": 4.076255798339844, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44297472, + "loss": 1.2682, + "grad_norm": 3.1658072471618652, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442999296, + "loss": 1.2845, + "grad_norm": 3.069446325302124, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442960896, + "loss": 1.3036, + "grad_norm": 3.197641372680664, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442963968, + "loss": 1.3179, + "grad_norm": 3.2924671173095703, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442976256, + "loss": 1.2769, + "grad_norm": 4.022733211517334, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442940928, + "loss": 1.2735, + "grad_norm": 3.0946011543273926, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.493458944, + "gpu_mem": 4.4429824, + "loss": 1.2261, + "grad_norm": 3.561920166015625, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44299776, + "loss": 1.248, + "grad_norm": 3.3285350799560547, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442962432, + "loss": 1.3244, + "grad_norm": 4.943061828613281, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442970112, + "loss": 1.3367, + "grad_norm": 4.610203266143799, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442991616, + "loss": 1.3017, + "grad_norm": 4.349905967712402, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.493458944, + "gpu_mem": 4.443002368, + "loss": 1.3214, + "grad_norm": 3.8690969944000244, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.493458944, + "gpu_mem": 4.442994688, + "loss": 1.3432, + "grad_norm": 3.9236385822296143, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44302848, + "loss": 1.3178, + "grad_norm": 3.8790531158447266, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.493458944, + "gpu_mem": 4.44302848, + "train_runtime": 672.9448, + "train_samples_per_second": 13.38, + "train_steps_per_second": 0.208, + "total_flos": 0.0, + "train_loss": 1.4192385068961553 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0052eed638e4aeb48f103586efb96096bb8d3ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2c2cb4ce87112b67373262724ab976f61aac3bff --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.37247474747474746 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..b9a859b72574b8a9ec3727d284beb13e1b96a949 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25389056 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-arc_e-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2", + "seed": 42, + "timestamp": "2025-08-31T05:51:47.380869" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..26560a2435c3775279dc26bf0fabeafd9ef4cff7 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.496137728, + "gpu_mem": 4.518964736, + "loss": 4.6319, + "grad_norm": 285.1859436035156, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.501642752, + "gpu_mem": 4.722138112, + "loss": 4.4578, + "grad_norm": 290.5561218261719, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.50183936, + "gpu_mem": 4.722116608, + "loss": 2.8478, + "grad_norm": 381.0905456542969, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.502035968, + "gpu_mem": 4.722095104, + "loss": 1.6686, + "grad_norm": 25.219541549682617, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.502035968, + "gpu_mem": 4.722136576, + "loss": 1.5693, + "grad_norm": 22.9230899810791, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.502035968, + "gpu_mem": 4.722112, + "loss": 1.4612, + "grad_norm": 21.357065200805664, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.502232576, + "gpu_mem": 4.72213504, + "loss": 1.6244, + "grad_norm": 26.6319637298584, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.502232576, + "gpu_mem": 4.722093568, + "loss": 1.3759, + "grad_norm": 7.972470760345459, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722095104, + "loss": 1.398, + "grad_norm": 12.75944995880127, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722090496, + "loss": 1.7315, + "grad_norm": 20.16497802734375, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722168832, + "loss": 1.5034, + "grad_norm": 12.408662796020508, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.502429184, + "gpu_mem": 4.72214272, + "loss": 1.3497, + "grad_norm": 6.996767044067383, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722093568, + "loss": 2.105, + "grad_norm": 52.36497116088867, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722115072, + "loss": 1.4537, + "grad_norm": 8.213821411132812, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722092032, + "loss": 1.4046, + "grad_norm": 8.018381118774414, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.502429184, + "gpu_mem": 4.72209664, + "loss": 1.4606, + "grad_norm": 8.81755256652832, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722133504, + "loss": 1.535, + "grad_norm": 16.856678009033203, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722144256, + "loss": 1.4464, + "grad_norm": 6.95269775390625, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722087424, + "loss": 1.2935, + "grad_norm": 4.264415264129639, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.502429184, + "gpu_mem": 4.72215808, + "loss": 1.6446, + "grad_norm": 16.577444076538086, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722156544, + "loss": 1.4246, + "grad_norm": 9.108543395996094, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722113536, + "loss": 1.4405, + "grad_norm": 8.338932991027832, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722130432, + "loss": 1.3325, + "grad_norm": 4.5872039794921875, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722087424, + "loss": 1.346, + "grad_norm": 3.732668161392212, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722116608, + "loss": 1.4111, + "grad_norm": 3.657146692276001, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.502429184, + "gpu_mem": 4.72209664, + "loss": 1.5039, + "grad_norm": 3.2886135578155518, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722122752, + "loss": 1.358, + "grad_norm": 2.674607276916504, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.502429184, + "gpu_mem": 4.722122752, + "loss": 1.4135, + "grad_norm": 3.7271816730499268, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722101248, + "loss": 1.2997, + "grad_norm": 3.6826894283294678, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722092032, + "loss": 1.3454, + "grad_norm": 2.921555757522583, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722110464, + "loss": 1.4119, + "grad_norm": 6.010624885559082, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722133504, + "loss": 1.3421, + "grad_norm": 3.9047536849975586, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722130432, + "loss": 1.38, + "grad_norm": 3.3717494010925293, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722133504, + "loss": 1.3892, + "grad_norm": 2.1161556243896484, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722115072, + "loss": 1.3076, + "grad_norm": 1.9025923013687134, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82366208, + "loss": 1.9438, + "grad_norm": 4.72139835357666, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823666688, + "loss": 1.3326, + "grad_norm": 4.325397968292236, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823645184, + "loss": 1.2542, + "grad_norm": 3.8574061393737793, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823634432, + "loss": 1.3937, + "grad_norm": 8.146137237548828, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823697408, + "loss": 1.398, + "grad_norm": 4.23073148727417, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823657472, + "loss": 1.5805, + "grad_norm": 9.995660781860352, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82370048, + "loss": 1.3628, + "grad_norm": 3.161339521408081, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823649792, + "loss": 1.4042, + "grad_norm": 4.245759010314941, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823714304, + "loss": 1.3295, + "grad_norm": 3.277341365814209, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823682048, + "loss": 1.4885, + "grad_norm": 4.2645487785339355, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823686656, + "loss": 1.4298, + "grad_norm": 5.765285015106201, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823632896, + "loss": 1.3327, + "grad_norm": 3.2660598754882812, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82364672, + "loss": 1.3981, + "grad_norm": 4.678296089172363, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823635968, + "loss": 2.184, + "grad_norm": 293.4337463378906, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823649792, + "loss": 1.4052, + "grad_norm": 5.673724174499512, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823702016, + "loss": 1.4252, + "grad_norm": 6.799444198608398, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823649792, + "loss": 1.3539, + "grad_norm": 2.679464340209961, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823718912, + "loss": 1.4057, + "grad_norm": 4.1786208152771, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823686656, + "loss": 1.4722, + "grad_norm": 5.400444507598877, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823695872, + "loss": 1.3521, + "grad_norm": 1.586466908454895, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823671296, + "loss": 1.3238, + "grad_norm": 1.901183843612671, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823705088, + "loss": 1.3406, + "grad_norm": 2.266773223876953, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823686656, + "loss": 1.3164, + "grad_norm": 0.935617983341217, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823672832, + "loss": 1.3413, + "grad_norm": 1.1456469297409058, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823711232, + "loss": 1.2904, + "grad_norm": 1.728401780128479, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823643648, + "loss": 1.3529, + "grad_norm": 3.0350029468536377, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823691264, + "loss": 1.4544, + "grad_norm": 3.896820068359375, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823640576, + "loss": 1.4155, + "grad_norm": 3.300171136856079, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823689728, + "loss": 1.3494, + "grad_norm": 2.4495275020599365, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823688192, + "loss": 1.4032, + "grad_norm": 3.058351993560791, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823706624, + "loss": 1.4233, + "grad_norm": 4.7667107582092285, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823648256, + "loss": 1.3687, + "grad_norm": 5.0993218421936035, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823660544, + "loss": 20.6412, + "grad_norm": 727.7623901367188, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82368512, + "loss": 5.2158, + "grad_norm": 216.47073364257812, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82366208, + "loss": 1.9586, + "grad_norm": 153.92445373535156, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823497728, + "loss": 2.0123, + "grad_norm": 5.384662628173828, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722125824, + "loss": 1.3851, + "grad_norm": 2.2103421688079834, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.502625792, + "gpu_mem": 4.72208896, + "loss": 1.3777, + "grad_norm": 1.1181278228759766, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722148864, + "loss": 1.3305, + "grad_norm": 0.8732612133026123, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722116608, + "loss": 1.344, + "grad_norm": 1.5410144329071045, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.502625792, + "gpu_mem": 4.72212736, + "loss": 1.2997, + "grad_norm": 0.754600465297699, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722164224, + "loss": 1.3713, + "grad_norm": 1.0481202602386475, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722148864, + "loss": 1.4209, + "grad_norm": 2.4341037273406982, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722099712, + "loss": 1.291, + "grad_norm": 1.7542685270309448, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722144256, + "loss": 1.346, + "grad_norm": 1.6453899145126343, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722130432, + "loss": 1.4232, + "grad_norm": 2.9081246852874756, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722098176, + "loss": 1.4118, + "grad_norm": 3.444525718688965, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722148864, + "loss": 1.3006, + "grad_norm": 2.5711867809295654, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722087424, + "loss": 1.4037, + "grad_norm": 2.308828115463257, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722133504, + "loss": 1.3524, + "grad_norm": 2.286288261413574, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722087424, + "loss": 1.3171, + "grad_norm": 0.8145406246185303, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722118144, + "loss": 1.4024, + "grad_norm": 2.189497232437134, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722093568, + "loss": 1.3276, + "grad_norm": 2.225213050842285, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722147328, + "loss": 1.3019, + "grad_norm": 1.4794793128967285, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722128896, + "loss": 1.3902, + "grad_norm": 2.832213878631592, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722078208, + "loss": 1.3622, + "grad_norm": 2.659364700317383, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722102784, + "loss": 1.3862, + "grad_norm": 2.9223179817199707, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722105856, + "loss": 1.3254, + "grad_norm": 2.7396914958953857, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722098176, + "loss": 1.3504, + "grad_norm": 2.2619333267211914, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722136576, + "loss": 1.375, + "grad_norm": 2.095527172088623, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722145792, + "loss": 1.2863, + "grad_norm": 2.3975515365600586, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.502625792, + "gpu_mem": 4.72208896, + "loss": 1.3655, + "grad_norm": 2.111492395401001, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.502625792, + "gpu_mem": 4.72208896, + "loss": 1.3394, + "grad_norm": 0.9519637823104858, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722085888, + "loss": 1.2922, + "grad_norm": 1.1138625144958496, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722084352, + "loss": 1.2824, + "grad_norm": 1.63324773311615, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.502625792, + "gpu_mem": 4.72212736, + "loss": 1.2767, + "grad_norm": 1.6835689544677734, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.502625792, + "gpu_mem": 4.72206592, + "loss": 1.3731, + "grad_norm": 2.8863682746887207, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722115072, + "loss": 1.3284, + "grad_norm": 1.9598286151885986, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722178048, + "loss": 1.3901, + "grad_norm": 3.4294772148132324, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722130432, + "loss": 1.3027, + "grad_norm": 1.786590814590454, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.502625792, + "gpu_mem": 4.722112, + "loss": 1.3242, + "grad_norm": 1.6533207893371582, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82368512, + "loss": 1.8733, + "grad_norm": 3.4330568313598633, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823666688, + "loss": 1.2738, + "grad_norm": 1.685111403465271, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823655936, + "loss": 1.3244, + "grad_norm": 1.720697045326233, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823709696, + "loss": 1.2854, + "grad_norm": 2.801140308380127, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82366976, + "loss": 1.3167, + "grad_norm": 1.9198555946350098, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823688192, + "loss": 1.2922, + "grad_norm": 1.4882051944732666, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823751168, + "loss": 1.2862, + "grad_norm": 1.7628090381622314, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823678976, + "loss": 1.3162, + "grad_norm": 1.6963146924972534, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823672832, + "loss": 1.3671, + "grad_norm": 2.351639747619629, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823688192, + "loss": 1.3129, + "grad_norm": 1.3395206928253174, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823703552, + "loss": 1.276, + "grad_norm": 1.8355594873428345, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823694336, + "loss": 1.2855, + "grad_norm": 1.5066239833831787, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82368512, + "loss": 1.2956, + "grad_norm": 1.6072317361831665, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823703552, + "loss": 1.2925, + "grad_norm": 1.6089941263198853, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823702016, + "loss": 1.2467, + "grad_norm": 1.8599209785461426, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823659008, + "loss": 1.2741, + "grad_norm": 1.5860140323638916, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823691264, + "loss": 1.2627, + "grad_norm": 1.814361810684204, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823645184, + "loss": 1.2697, + "grad_norm": 1.6075295209884644, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823689728, + "loss": 1.2082, + "grad_norm": 2.1656956672668457, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823640576, + "loss": 1.2388, + "grad_norm": 1.7753299474716187, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823652864, + "loss": 1.2669, + "grad_norm": 1.6709108352661133, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82367744, + "loss": 1.28, + "grad_norm": 2.1730709075927734, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82363904, + "loss": 1.2608, + "grad_norm": 1.86732017993927, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823642112, + "loss": 1.3212, + "grad_norm": 2.1604793071746826, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.502625792, + "gpu_mem": 4.8236544, + "loss": 1.2794, + "grad_norm": 2.1699371337890625, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823619072, + "loss": 1.2438, + "grad_norm": 1.8384634256362915, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823660544, + "loss": 1.206, + "grad_norm": 1.811904788017273, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823675904, + "loss": 1.2126, + "grad_norm": 1.9175496101379395, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823640576, + "loss": 1.3218, + "grad_norm": 2.67806077003479, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823648256, + "loss": 1.2947, + "grad_norm": 2.2404983043670654, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.502625792, + "gpu_mem": 4.82366976, + "loss": 1.2894, + "grad_norm": 2.716714382171631, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823680512, + "loss": 1.2933, + "grad_norm": 1.7514668703079224, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823672832, + "loss": 1.3346, + "grad_norm": 1.9952417612075806, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823706624, + "loss": 1.3088, + "grad_norm": 2.4654135704040527, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.502625792, + "gpu_mem": 4.823706624, + "train_runtime": 678.9966, + "train_samples_per_second": 13.261, + "train_steps_per_second": 0.206, + "total_flos": 0.0, + "train_loss": 1.6106574450220381 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6934cfad94edb068f0d54db83e6a8b58f0fc939 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe3ac46c1221606cee5f15e20691f184cd7266d --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.6611952861952862 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..3492e2932810fab7849e0313b69ef2a9a29e881c --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6317696 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-arc_e-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T22:53:40.430500" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..190f8b2444406592dd15c7280af260e6d15fdb88 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.488003072, + "gpu_mem": 4.44271872, + "loss": 4.6319, + "grad_norm": 276.5605773925781, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.493704704, + "gpu_mem": 4.493400064, + "loss": 4.4578, + "grad_norm": 282.3363952636719, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.493901312, + "gpu_mem": 4.49337856, + "loss": 2.9702, + "grad_norm": 613.773193359375, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.493901312, + "gpu_mem": 4.493357056, + "loss": 1.9283, + "grad_norm": 45.13872146606445, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.49409792, + "gpu_mem": 4.493398528, + "loss": 1.493, + "grad_norm": 16.81772232055664, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493373952, + "loss": 1.3415, + "grad_norm": 12.790365219116211, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493396992, + "loss": 1.5003, + "grad_norm": 25.754161834716797, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.494294528, + "gpu_mem": 4.49335552, + "loss": 1.3258, + "grad_norm": 6.1254353523254395, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493357056, + "loss": 1.4644, + "grad_norm": 34.32180404663086, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493352448, + "loss": 1.4288, + "grad_norm": 23.392850875854492, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493430784, + "loss": 1.5429, + "grad_norm": 20.585525512695312, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493404672, + "loss": 1.305, + "grad_norm": 5.136800765991211, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.494294528, + "gpu_mem": 4.49335552, + "loss": 1.3883, + "grad_norm": 26.17745590209961, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493377024, + "loss": 1.4056, + "grad_norm": 12.29943561553955, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493353984, + "loss": 1.3972, + "grad_norm": 5.9298176765441895, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493358592, + "loss": 1.3433, + "grad_norm": 2.671994924545288, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.494294528, + "gpu_mem": 4.493395456, + "loss": 1.3402, + "grad_norm": 5.09276008605957, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493406208, + "loss": 1.3272, + "grad_norm": 4.426205635070801, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493349376, + "loss": 1.4243, + "grad_norm": 10.765657424926758, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493420032, + "loss": 1.4339, + "grad_norm": 7.541872024536133, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493418496, + "loss": 1.349, + "grad_norm": 5.185708522796631, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493375488, + "loss": 1.3539, + "grad_norm": 6.7187676429748535, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493392384, + "loss": 1.3318, + "grad_norm": 6.468657493591309, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493349376, + "loss": 1.306, + "grad_norm": 3.4460861682891846, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.494491136, + "gpu_mem": 4.49337856, + "loss": 1.5276, + "grad_norm": 12.144696235656738, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493358592, + "loss": 1.4864, + "grad_norm": 5.351874351501465, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493384704, + "loss": 1.3675, + "grad_norm": 5.478825092315674, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493384704, + "loss": 1.4199, + "grad_norm": 5.453819274902344, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.494491136, + "gpu_mem": 4.4933632, + "loss": 1.2599, + "grad_norm": 2.9942517280578613, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493353984, + "loss": 1.3457, + "grad_norm": 3.5960090160369873, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493372416, + "loss": 1.3961, + "grad_norm": 5.403853416442871, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493395456, + "loss": 1.3711, + "grad_norm": 6.018552303314209, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493392384, + "loss": 1.3374, + "grad_norm": 2.8503122329711914, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493395456, + "loss": 1.386, + "grad_norm": 4.291422367095947, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493377024, + "loss": 1.3118, + "grad_norm": 3.1133341789245605, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518678016, + "loss": 1.9673, + "grad_norm": 5.740923881530762, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518682624, + "loss": 1.339, + "grad_norm": 3.4668984413146973, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51866112, + "loss": 1.2514, + "grad_norm": 3.128293514251709, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518650368, + "loss": 1.3329, + "grad_norm": 4.251654148101807, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518713344, + "loss": 1.3431, + "grad_norm": 3.2362022399902344, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518673408, + "loss": 1.3388, + "grad_norm": 3.327332019805908, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518716416, + "loss": 1.3812, + "grad_norm": 3.0941553115844727, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518665728, + "loss": 1.3735, + "grad_norm": 2.356278419494629, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51873024, + "loss": 1.3766, + "grad_norm": 3.27545428276062, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518697984, + "loss": 1.3891, + "grad_norm": 2.333723306655884, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518702592, + "loss": 1.3937, + "grad_norm": 3.481844425201416, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518648832, + "loss": 1.3532, + "grad_norm": 4.956977844238281, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518662656, + "loss": 1.3068, + "grad_norm": 3.1343212127685547, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518651904, + "loss": 1.3936, + "grad_norm": 8.055434226989746, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518665728, + "loss": 1.4007, + "grad_norm": 16.459016799926758, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518717952, + "loss": 1.4211, + "grad_norm": 16.374134063720703, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518665728, + "loss": 1.3547, + "grad_norm": 4.7861762046813965, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518734848, + "loss": 1.3086, + "grad_norm": 5.092458724975586, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518702592, + "loss": 1.3138, + "grad_norm": 3.623900890350342, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518711808, + "loss": 1.3956, + "grad_norm": 4.312674522399902, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518687232, + "loss": 1.341, + "grad_norm": 3.958331346511841, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518721024, + "loss": 1.3175, + "grad_norm": 3.097858190536499, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518702592, + "loss": 1.3063, + "grad_norm": 3.0172431468963623, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518688768, + "loss": 1.3144, + "grad_norm": 2.152859687805176, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518727168, + "loss": 1.3108, + "grad_norm": 3.8040716648101807, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518659584, + "loss": 1.3203, + "grad_norm": 3.8633904457092285, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.494491136, + "gpu_mem": 4.5187072, + "loss": 1.3983, + "grad_norm": 4.749051094055176, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518656512, + "loss": 1.3682, + "grad_norm": 5.358827114105225, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518705664, + "loss": 1.4047, + "grad_norm": 5.390880584716797, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518704128, + "loss": 1.3214, + "grad_norm": 4.8978447914123535, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51872256, + "loss": 1.3365, + "grad_norm": 4.024534225463867, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518664192, + "loss": 1.3593, + "grad_norm": 10.947258949279785, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51867648, + "loss": 1.3801, + "grad_norm": 7.698343276977539, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518701056, + "loss": 1.365, + "grad_norm": 6.346189022064209, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518678016, + "loss": 1.2689, + "grad_norm": 4.332313060760498, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518513664, + "loss": 1.9494, + "grad_norm": 61.389652252197266, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493387776, + "loss": 1.369, + "grad_norm": 5.659824848175049, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493350912, + "loss": 1.3145, + "grad_norm": 4.936837673187256, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493410816, + "loss": 1.2625, + "grad_norm": 3.075138568878174, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.494491136, + "gpu_mem": 4.49337856, + "loss": 1.34, + "grad_norm": 2.8183681964874268, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493389312, + "loss": 1.2405, + "grad_norm": 2.041452169418335, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493426176, + "loss": 1.3003, + "grad_norm": 3.310304880142212, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493410816, + "loss": 1.3301, + "grad_norm": 4.006730079650879, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493361664, + "loss": 1.2198, + "grad_norm": 3.7885594367980957, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493406208, + "loss": 1.3053, + "grad_norm": 3.224207639694214, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493392384, + "loss": 1.3576, + "grad_norm": 5.85601282119751, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493360128, + "loss": 1.3497, + "grad_norm": 6.43959903717041, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493410816, + "loss": 1.2215, + "grad_norm": 5.418457508087158, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493349376, + "loss": 1.3708, + "grad_norm": 6.882091045379639, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493395456, + "loss": 1.3319, + "grad_norm": 6.375625133514404, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493349376, + "loss": 1.29, + "grad_norm": 4.537503719329834, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493380096, + "loss": 1.2986, + "grad_norm": 5.286020278930664, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.494491136, + "gpu_mem": 4.49335552, + "loss": 1.2059, + "grad_norm": 4.801527500152588, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.494491136, + "gpu_mem": 4.49340928, + "loss": 1.2417, + "grad_norm": 4.618600845336914, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493390848, + "loss": 1.3213, + "grad_norm": 5.864246845245361, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.494491136, + "gpu_mem": 4.49334016, + "loss": 1.3045, + "grad_norm": 5.989525318145752, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493364736, + "loss": 1.3411, + "grad_norm": 6.808497905731201, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493367808, + "loss": 1.2999, + "grad_norm": 7.139451026916504, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493360128, + "loss": 1.2241, + "grad_norm": 5.613903522491455, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493398528, + "loss": 1.2218, + "grad_norm": 4.103054523468018, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493407744, + "loss": 1.1553, + "grad_norm": 4.9666829109191895, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493350912, + "loss": 1.2102, + "grad_norm": 4.077977180480957, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493350912, + "loss": 1.276, + "grad_norm": 4.605331897735596, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.494491136, + "gpu_mem": 4.49334784, + "loss": 1.2602, + "grad_norm": 7.203280448913574, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493346304, + "loss": 1.2185, + "grad_norm": 4.943838119506836, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493389312, + "loss": 1.1767, + "grad_norm": 4.598587512969971, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493327872, + "loss": 1.2838, + "grad_norm": 4.879561901092529, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493377024, + "loss": 1.2128, + "grad_norm": 4.067531108856201, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.494491136, + "gpu_mem": 4.49344, + "loss": 1.2998, + "grad_norm": 5.858630180358887, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493392384, + "loss": 1.1456, + "grad_norm": 4.226877689361572, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.494491136, + "gpu_mem": 4.493373952, + "loss": 1.2223, + "grad_norm": 4.734609127044678, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518701056, + "loss": 1.625, + "grad_norm": 12.016558647155762, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518682624, + "loss": 1.025, + "grad_norm": 5.136899948120117, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518671872, + "loss": 1.0919, + "grad_norm": 6.250692844390869, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518725632, + "loss": 0.9629, + "grad_norm": 6.343587875366211, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518685696, + "loss": 1.0155, + "grad_norm": 6.334781169891357, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518704128, + "loss": 1.039, + "grad_norm": 8.391195297241211, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518767104, + "loss": 0.9537, + "grad_norm": 7.252392768859863, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518694912, + "loss": 1.0189, + "grad_norm": 8.310603141784668, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518688768, + "loss": 1.0358, + "grad_norm": 7.496110916137695, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518704128, + "loss": 0.962, + "grad_norm": 8.337830543518066, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518719488, + "loss": 0.9699, + "grad_norm": 7.644598007202148, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518710272, + "loss": 0.89, + "grad_norm": 6.858006000518799, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518701056, + "loss": 0.8971, + "grad_norm": 8.39448356628418, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518719488, + "loss": 0.9552, + "grad_norm": 8.790838241577148, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518717952, + "loss": 0.7973, + "grad_norm": 9.548598289489746, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518674944, + "loss": 0.8216, + "grad_norm": 9.652661323547363, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.494491136, + "gpu_mem": 4.5187072, + "loss": 0.804, + "grad_norm": 8.80784797668457, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51866112, + "loss": 0.8326, + "grad_norm": 9.57839584350586, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518705664, + "loss": 0.8423, + "grad_norm": 11.19555377960205, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518656512, + "loss": 0.7886, + "grad_norm": 10.461869239807129, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.494491136, + "gpu_mem": 4.5186688, + "loss": 0.8942, + "grad_norm": 10.252334594726562, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518693376, + "loss": 0.9278, + "grad_norm": 11.462838172912598, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518654976, + "loss": 0.7816, + "grad_norm": 10.681913375854492, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518658048, + "loss": 0.9345, + "grad_norm": 15.080108642578125, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518670336, + "loss": 0.8457, + "grad_norm": 11.008662223815918, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518635008, + "loss": 0.8203, + "grad_norm": 9.407642364501953, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51867648, + "loss": 0.8339, + "grad_norm": 9.52961254119873, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51869184, + "loss": 0.8828, + "grad_norm": 9.80649185180664, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518656512, + "loss": 0.9178, + "grad_norm": 12.667458534240723, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518664192, + "loss": 0.7627, + "grad_norm": 11.412312507629395, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518685696, + "loss": 0.7683, + "grad_norm": 11.488083839416504, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518696448, + "loss": 0.816, + "grad_norm": 10.21458911895752, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.494491136, + "gpu_mem": 4.518688768, + "loss": 0.9006, + "grad_norm": 12.040121078491211, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51872256, + "loss": 0.8363, + "grad_norm": 12.032706260681152, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.494491136, + "gpu_mem": 4.51872256, + "train_runtime": 674.8012, + "train_samples_per_second": 13.343, + "train_steps_per_second": 0.207, + "total_flos": 0.0, + "train_loss": 1.3040791460445949 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..91562a2718627f56cb3f88093dd26c3a98c35384 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f06a06dda43805b5b0871c9ba520abae53770336 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.7926605504587156 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..6358b188c6c3b0b67f30e428137f965d29eb38ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1577576 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-boolq-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2", + "seed": 42, + "timestamp": "2025-08-30T12:13:11.031630" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..4f5ce0a14bf47bdeaa1e506e9f14e922e70f6f7a --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.4856192, + "gpu_mem": 4.424159232, + "loss": 8.869, + "grad_norm": 265.7165832519531, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.491910656, + "gpu_mem": 4.437070336, + "loss": 8.9376, + "grad_norm": 272.0975036621094, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.492697088, + "gpu_mem": 4.436988928, + "loss": 8.2439, + "grad_norm": 279.63671875, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.49348352, + "gpu_mem": 4.436988928, + "loss": 6.7912, + "grad_norm": 283.7794494628906, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.493876736, + "gpu_mem": 4.436924416, + "loss": 4.9316, + "grad_norm": 254.95008850097656, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.49446656, + "gpu_mem": 4.436944384, + "loss": 3.114, + "grad_norm": 200.49691772460938, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.495056384, + "gpu_mem": 4.436996608, + "loss": 1.6758, + "grad_norm": 93.27091217041016, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.495646208, + "gpu_mem": 4.437082624, + "loss": 1.039, + "grad_norm": 57.26808547973633, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.496039424, + "gpu_mem": 4.436990464, + "loss": 1.0551, + "grad_norm": 59.7805061340332, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.49643264, + "gpu_mem": 4.436890624, + "loss": 0.7487, + "grad_norm": 19.48933982849121, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.496825856, + "gpu_mem": 4.436995072, + "loss": 1.1287, + "grad_norm": 91.75373840332031, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.49741568, + "gpu_mem": 4.437366784, + "loss": 1.0336, + "grad_norm": 73.30252838134766, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.497808896, + "gpu_mem": 4.436970496, + "loss": 0.6842, + "grad_norm": 6.66005802154541, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.498202112, + "gpu_mem": 4.436947456, + "loss": 0.745, + "grad_norm": 33.216796875, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.49839872, + "gpu_mem": 4.436886016, + "loss": 0.8204, + "grad_norm": 36.137210845947266, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.498791936, + "gpu_mem": 4.436970496, + "loss": 0.7792, + "grad_norm": 31.646080017089844, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.499185152, + "gpu_mem": 4.437010432, + "loss": 0.7334, + "grad_norm": 24.66205596923828, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.499578368, + "gpu_mem": 4.437073408, + "loss": 0.6744, + "grad_norm": 3.4665486812591553, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.499971584, + "gpu_mem": 4.436910592, + "loss": 0.8234, + "grad_norm": 28.424806594848633, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.500168192, + "gpu_mem": 4.43702272, + "loss": 0.6248, + "grad_norm": 3.359628677368164, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.500561408, + "gpu_mem": 4.437180928, + "loss": 0.6311, + "grad_norm": 10.9364595413208, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.500954624, + "gpu_mem": 4.437073408, + "loss": 0.7505, + "grad_norm": 14.361480712890625, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.501151232, + "gpu_mem": 4.43704576, + "loss": 0.6607, + "grad_norm": 11.354830741882324, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.50134784, + "gpu_mem": 4.437102592, + "loss": 0.608, + "grad_norm": 5.800236225128174, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.501741056, + "gpu_mem": 4.436887552, + "loss": 0.8785, + "grad_norm": 32.040443420410156, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.501937664, + "gpu_mem": 4.436942848, + "loss": 1.0202, + "grad_norm": 42.82334899902344, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.50233088, + "gpu_mem": 4.437234688, + "loss": 0.6503, + "grad_norm": 10.639617919921875, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.502527488, + "gpu_mem": 4.436913664, + "loss": 0.7747, + "grad_norm": 29.44213104248047, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.502920704, + "gpu_mem": 4.436978176, + "loss": 1.0303, + "grad_norm": 55.7458381652832, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.503117312, + "gpu_mem": 4.437056512, + "loss": 0.7492, + "grad_norm": 22.058603286743164, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.50331392, + "gpu_mem": 4.436859904, + "loss": 0.5481, + "grad_norm": 3.686203956604004, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.503510528, + "gpu_mem": 4.436973568, + "loss": 0.834, + "grad_norm": 23.762094497680664, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.503707136, + "gpu_mem": 4.437211648, + "loss": 0.8735, + "grad_norm": 20.501628875732422, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.503903744, + "gpu_mem": 4.436913664, + "loss": 0.5833, + "grad_norm": 5.7299933433532715, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.504100352, + "gpu_mem": 4.437124096, + "loss": 0.6658, + "grad_norm": 7.160278797149658, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.50429696, + "gpu_mem": 4.437074944, + "loss": 0.711, + "grad_norm": 18.156116485595703, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.504493568, + "gpu_mem": 4.436886016, + "loss": 0.5941, + "grad_norm": 4.312148094177246, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.504690176, + "gpu_mem": 4.437133312, + "loss": 0.8683, + "grad_norm": 32.92335891723633, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.504886784, + "gpu_mem": 4.437512704, + "loss": 0.7286, + "grad_norm": 17.68904685974121, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.505083392, + "gpu_mem": 4.437082624, + "loss": 0.5857, + "grad_norm": 4.258547782897949, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.50528, + "gpu_mem": 4.437309952, + "loss": 0.6943, + "grad_norm": 15.192357063293457, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.505476608, + "gpu_mem": 4.43720704, + "loss": 0.5717, + "grad_norm": 10.226648330688477, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.505673216, + "gpu_mem": 4.437028864, + "loss": 0.6568, + "grad_norm": 8.291679382324219, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.505869824, + "gpu_mem": 4.437171712, + "loss": 0.6187, + "grad_norm": 9.696277618408203, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.505869824, + "gpu_mem": 4.436952064, + "loss": 0.8114, + "grad_norm": 17.778474807739258, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.506066432, + "gpu_mem": 4.437194752, + "loss": 0.7589, + "grad_norm": 25.64463996887207, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.50626304, + "gpu_mem": 4.436918272, + "loss": 0.8276, + "grad_norm": 28.21428108215332, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.50626304, + "gpu_mem": 4.436995072, + "loss": 0.6789, + "grad_norm": 26.740333557128906, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.506459648, + "gpu_mem": 4.437011968, + "loss": 0.7783, + "grad_norm": 16.810571670532227, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.506459648, + "gpu_mem": 4.436950528, + "loss": 0.6132, + "grad_norm": 5.90757942199707, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.506656256, + "gpu_mem": 4.436955136, + "loss": 0.5682, + "grad_norm": 6.385296821594238, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.506656256, + "gpu_mem": 4.437035008, + "loss": 0.5911, + "grad_norm": 4.772059917449951, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.506852864, + "gpu_mem": 4.437058048, + "loss": 0.5848, + "grad_norm": 9.717635154724121, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.507049472, + "gpu_mem": 4.436985856, + "loss": 0.6352, + "grad_norm": 10.524502754211426, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.507049472, + "gpu_mem": 4.437256192, + "loss": 0.6172, + "grad_norm": 6.137028694152832, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.50724608, + "gpu_mem": 4.437042688, + "loss": 0.7273, + "grad_norm": 11.40707015991211, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.50724608, + "gpu_mem": 4.437036544, + "loss": 0.5833, + "grad_norm": 8.116436958312988, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.507442688, + "gpu_mem": 4.436932096, + "loss": 0.5561, + "grad_norm": 4.918622016906738, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.507442688, + "gpu_mem": 4.436948992, + "loss": 0.6541, + "grad_norm": 8.79553508758545, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.507639296, + "gpu_mem": 4.437042688, + "loss": 0.5448, + "grad_norm": 4.7404704093933105, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.507639296, + "gpu_mem": 4.43705344, + "loss": 0.5885, + "grad_norm": 4.77510929107666, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.507835904, + "gpu_mem": 4.437041152, + "loss": 0.7054, + "grad_norm": 11.460348129272461, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.507835904, + "gpu_mem": 4.437033472, + "loss": 0.5369, + "grad_norm": 31.070810317993164, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.507835904, + "gpu_mem": 4.436962816, + "loss": 0.609, + "grad_norm": 15.022584915161133, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.507835904, + "gpu_mem": 4.43700736, + "loss": 0.5679, + "grad_norm": 5.917741775512695, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.507835904, + "gpu_mem": 4.437200896, + "loss": 0.5544, + "grad_norm": 6.815539836883545, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.508032512, + "gpu_mem": 4.436910592, + "loss": 0.6308, + "grad_norm": 13.951557159423828, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.508032512, + "gpu_mem": 4.436878336, + "loss": 0.7345, + "grad_norm": 16.037248611450195, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.50822912, + "gpu_mem": 4.436944384, + "loss": 0.6223, + "grad_norm": 8.664552688598633, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.50822912, + "gpu_mem": 4.43693824, + "loss": 0.5409, + "grad_norm": 9.303351402282715, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.50822912, + "gpu_mem": 4.437167104, + "loss": 0.5304, + "grad_norm": 7.759465217590332, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.50822912, + "gpu_mem": 4.437159424, + "loss": 0.5726, + "grad_norm": 8.323700904846191, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.508425728, + "gpu_mem": 4.437125632, + "loss": 0.8418, + "grad_norm": 20.5197696685791, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.508622336, + "gpu_mem": 4.436985856, + "loss": 0.506, + "grad_norm": 8.913507461547852, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.508622336, + "gpu_mem": 4.436910592, + "loss": 0.5309, + "grad_norm": 10.124161720275879, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.508622336, + "gpu_mem": 4.436850688, + "loss": 0.6009, + "grad_norm": 12.703954696655273, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.508622336, + "gpu_mem": 4.436924416, + "loss": 0.5558, + "grad_norm": 10.466872215270996, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.508622336, + "gpu_mem": 4.43697664, + "loss": 0.7043, + "grad_norm": 20.635786056518555, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.508622336, + "gpu_mem": 4.437108736, + "loss": 0.6018, + "grad_norm": 11.645002365112305, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.508622336, + "gpu_mem": 4.43699968, + "loss": 0.5279, + "grad_norm": 6.659698963165283, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.508622336, + "gpu_mem": 4.436879872, + "loss": 0.5791, + "grad_norm": 6.4264235496521, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.508622336, + "gpu_mem": 4.436948992, + "loss": 0.6501, + "grad_norm": 9.969060897827148, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.508818944, + "gpu_mem": 4.437048832, + "loss": 0.6573, + "grad_norm": 8.324270248413086, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.508818944, + "gpu_mem": 4.437011968, + "loss": 0.5463, + "grad_norm": 8.097661972045898, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.508818944, + "gpu_mem": 4.437044224, + "loss": 0.5767, + "grad_norm": 6.6934494972229, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.509015552, + "gpu_mem": 4.436995072, + "loss": 0.6116, + "grad_norm": 9.769342422485352, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.509015552, + "gpu_mem": 4.437002752, + "loss": 0.564, + "grad_norm": 9.166972160339355, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.509015552, + "gpu_mem": 4.437147136, + "loss": 0.5411, + "grad_norm": 5.252224445343018, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.509015552, + "gpu_mem": 4.436929024, + "loss": 0.6234, + "grad_norm": 6.647994518280029, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.509015552, + "gpu_mem": 4.436982784, + "loss": 0.5256, + "grad_norm": 7.916544437408447, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.509015552, + "gpu_mem": 4.436950528, + "loss": 0.4528, + "grad_norm": 6.592220306396484, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.509015552, + "gpu_mem": 4.437031936, + "loss": 0.6788, + "grad_norm": 12.278592109680176, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.509015552, + "gpu_mem": 4.436835328, + "loss": 0.8175, + "grad_norm": 13.33436393737793, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.509015552, + "gpu_mem": 4.436948992, + "loss": 0.5207, + "grad_norm": 6.806389331817627, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.509015552, + "gpu_mem": 4.43696896, + "loss": 0.5228, + "grad_norm": 6.125219821929932, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.50921216, + "gpu_mem": 4.43700736, + "loss": 0.4348, + "grad_norm": 6.964541912078857, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.50921216, + "gpu_mem": 4.436992, + "loss": 0.6261, + "grad_norm": 8.247956275939941, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.50921216, + "gpu_mem": 4.436904448, + "loss": 0.4827, + "grad_norm": 8.204977035522461, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.50921216, + "gpu_mem": 4.43685376, + "loss": 0.5029, + "grad_norm": 11.964973449707031, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.50921216, + "gpu_mem": 4.436970496, + "loss": 0.51, + "grad_norm": 11.552522659301758, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.50921216, + "gpu_mem": 4.437002752, + "loss": 0.5467, + "grad_norm": 13.213567733764648, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.50921216, + "gpu_mem": 4.437036544, + "loss": 0.5372, + "grad_norm": 8.773509979248047, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.50921216, + "gpu_mem": 4.437087232, + "loss": 0.6076, + "grad_norm": 12.28891372680664, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.50921216, + "gpu_mem": 4.436992, + "loss": 0.5038, + "grad_norm": 11.784523010253906, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.50921216, + "gpu_mem": 4.437093376, + "loss": 0.4978, + "grad_norm": 8.252092361450195, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.50921216, + "gpu_mem": 4.437044224, + "loss": 0.5585, + "grad_norm": 9.146671295166016, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436932096, + "loss": 0.512, + "grad_norm": 7.901447772979736, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437116416, + "loss": 0.5794, + "grad_norm": 7.828333377838135, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436970496, + "loss": 0.5699, + "grad_norm": 5.465837478637695, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436973568, + "loss": 0.5263, + "grad_norm": 5.203726768493652, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436942848, + "loss": 0.5348, + "grad_norm": 6.427229881286621, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436988928, + "loss": 0.5472, + "grad_norm": 6.567240238189697, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436979712, + "loss": 0.5146, + "grad_norm": 5.874990463256836, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.509408768, + "gpu_mem": 4.43696128, + "loss": 0.4087, + "grad_norm": 6.523895740509033, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437036544, + "loss": 0.4837, + "grad_norm": 6.8895134925842285, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436956672, + "loss": 0.53, + "grad_norm": 12.884737968444824, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436847616, + "loss": 0.418, + "grad_norm": 8.543777465820312, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437085696, + "loss": 0.6238, + "grad_norm": 18.599990844726562, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437256192, + "loss": 0.4314, + "grad_norm": 8.732511520385742, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436988928, + "loss": 0.4056, + "grad_norm": 9.370827674865723, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437016576, + "loss": 0.6384, + "grad_norm": 15.165224075317383, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437067264, + "loss": 0.5406, + "grad_norm": 14.913131713867188, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.509408768, + "gpu_mem": 4.4368768, + "loss": 0.429, + "grad_norm": 10.716595649719238, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437319168, + "loss": 0.3996, + "grad_norm": 7.652353286743164, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.509408768, + "gpu_mem": 4.43704576, + "loss": 0.5075, + "grad_norm": 13.265007972717285, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436929024, + "loss": 0.4667, + "grad_norm": 9.696459770202637, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.509408768, + "gpu_mem": 4.43736832, + "loss": 0.5595, + "grad_norm": 15.351218223571777, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437144064, + "loss": 0.5056, + "grad_norm": 10.441043853759766, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437184, + "loss": 0.6595, + "grad_norm": 13.417473793029785, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436965888, + "loss": 0.6462, + "grad_norm": 7.923618316650391, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437094912, + "loss": 0.5114, + "grad_norm": 6.110599994659424, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.509408768, + "gpu_mem": 4.43717632, + "loss": 0.4793, + "grad_norm": 12.383698463439941, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.509408768, + "gpu_mem": 4.436959744, + "loss": 0.5583, + "grad_norm": 12.624692916870117, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.509408768, + "gpu_mem": 4.437093376, + "loss": 0.4981, + "grad_norm": 12.406174659729004, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.509605376, + "gpu_mem": 4.437116416, + "loss": 0.4536, + "grad_norm": 7.415020942687988, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.509605376, + "gpu_mem": 4.4369536, + "loss": 0.5096, + "grad_norm": 9.677729606628418, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.509605376, + "gpu_mem": 4.436833792, + "loss": 0.5439, + "grad_norm": 11.361948013305664, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.509605376, + "gpu_mem": 4.43701504, + "loss": 0.5307, + "grad_norm": 11.554671287536621, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.509605376, + "gpu_mem": 4.436913664, + "loss": 0.4409, + "grad_norm": 7.895120620727539, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.509605376, + "gpu_mem": 4.436965888, + "loss": 0.4858, + "grad_norm": 10.334193229675293, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.509605376, + "gpu_mem": 4.436998144, + "loss": 0.6513, + "grad_norm": 19.94317626953125, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.509605376, + "gpu_mem": 4.43713792, + "loss": 0.5677, + "grad_norm": 16.330734252929688, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.509605376, + "gpu_mem": 4.437121024, + "loss": 0.5919, + "grad_norm": 11.14167308807373, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.509605376, + "gpu_mem": 4.437313024, + "loss": 0.5742, + "grad_norm": 7.925104141235352, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.509605376, + "gpu_mem": 4.437024256, + "loss": 0.4559, + "grad_norm": 6.353579044342041, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.509605376, + "gpu_mem": 4.437059584, + "loss": 0.579, + "grad_norm": 9.637496948242188, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.509605376, + "gpu_mem": 4.436958208, + "loss": 0.5382, + "grad_norm": 7.5189595222473145, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443509248, + "loss": 0.6157, + "grad_norm": 11.473037719726562, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443444736, + "loss": 0.3944, + "grad_norm": 5.674694538116455, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44328192, + "loss": 0.3795, + "grad_norm": 6.071469783782959, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443354112, + "loss": 0.3988, + "grad_norm": 5.714435577392578, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44338944, + "loss": 0.3796, + "grad_norm": 6.586073875427246, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443414016, + "loss": 0.452, + "grad_norm": 6.130358695983887, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443375616, + "loss": 0.3782, + "grad_norm": 6.469320774078369, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.509605376, + "gpu_mem": 4.4435968, + "loss": 0.4447, + "grad_norm": 9.796608924865723, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44350464, + "loss": 0.3952, + "grad_norm": 10.653168678283691, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443410944, + "loss": 0.4352, + "grad_norm": 10.047937393188477, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443337216, + "loss": 0.3561, + "grad_norm": 7.982423782348633, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443685888, + "loss": 0.3552, + "grad_norm": 8.629858016967773, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443280384, + "loss": 0.3805, + "grad_norm": 10.6151123046875, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443226624, + "loss": 0.38, + "grad_norm": 8.782222747802734, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.509605376, + "gpu_mem": 4.444002304, + "loss": 0.26, + "grad_norm": 7.612800598144531, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443478528, + "loss": 0.3393, + "grad_norm": 7.5802998542785645, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443390976, + "loss": 0.429, + "grad_norm": 9.032646179199219, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443340288, + "loss": 0.3151, + "grad_norm": 8.619991302490234, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44343552, + "loss": 0.3742, + "grad_norm": 9.681302070617676, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443352576, + "loss": 0.4062, + "grad_norm": 14.850049018859863, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443371008, + "loss": 0.4535, + "grad_norm": 10.279086112976074, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44345856, + "loss": 0.3463, + "grad_norm": 9.80820369720459, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44334336, + "loss": 0.3783, + "grad_norm": 10.852489471435547, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443406336, + "loss": 0.4453, + "grad_norm": 11.156643867492676, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443314176, + "loss": 0.3344, + "grad_norm": 8.870190620422363, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443613696, + "loss": 0.4352, + "grad_norm": 9.64120101928711, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443337216, + "loss": 0.333, + "grad_norm": 8.450601577758789, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443303424, + "loss": 0.3363, + "grad_norm": 7.288397312164307, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443441664, + "loss": 0.3218, + "grad_norm": 10.227561950683594, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443539968, + "loss": 0.351, + "grad_norm": 7.246642112731934, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443286528, + "loss": 0.5062, + "grad_norm": 15.948833465576172, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443386368, + "loss": 0.413, + "grad_norm": 9.143070220947266, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44335872, + "loss": 0.314, + "grad_norm": 9.495607376098633, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443295744, + "loss": 0.3162, + "grad_norm": 9.015178680419922, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443513856, + "loss": 0.4318, + "grad_norm": 10.653160095214844, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443410944, + "loss": 0.3551, + "grad_norm": 11.33315372467041, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44335872, + "loss": 0.3593, + "grad_norm": 9.929872512817383, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443337216, + "loss": 0.3663, + "grad_norm": 9.877488136291504, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443346432, + "loss": 0.2439, + "grad_norm": 6.803407192230225, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443278848, + "loss": 0.4327, + "grad_norm": 13.585451126098633, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443441664, + "loss": 0.245, + "grad_norm": 10.62351131439209, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443311104, + "loss": 0.4337, + "grad_norm": 9.73697280883789, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443430912, + "loss": 0.3561, + "grad_norm": 11.984561920166016, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443249664, + "loss": 0.3225, + "grad_norm": 12.810022354125977, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44338176, + "loss": 0.2664, + "grad_norm": 10.05370807647705, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443355648, + "loss": 0.4455, + "grad_norm": 14.030529975891113, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443321856, + "loss": 0.3602, + "grad_norm": 12.235883712768555, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443426304, + "loss": 0.2762, + "grad_norm": 9.612601280212402, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443421696, + "loss": 0.3528, + "grad_norm": 8.862492561340332, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443280384, + "loss": 0.2842, + "grad_norm": 11.436196327209473, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443472384, + "loss": 0.3027, + "grad_norm": 10.520874977111816, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443323392, + "loss": 0.3717, + "grad_norm": 14.724334716796875, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443426304, + "loss": 0.3074, + "grad_norm": 12.475892066955566, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443629056, + "loss": 0.2782, + "grad_norm": 8.169771194458008, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443430912, + "loss": 0.2923, + "grad_norm": 9.363910675048828, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443317248, + "loss": 0.2826, + "grad_norm": 7.223607063293457, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443331072, + "loss": 0.262, + "grad_norm": 8.399836540222168, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443375616, + "loss": 0.2814, + "grad_norm": 8.11514663696289, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443317248, + "loss": 0.2771, + "grad_norm": 9.344612121582031, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44355072, + "loss": 0.3327, + "grad_norm": 10.335131645202637, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44358144, + "loss": 0.2319, + "grad_norm": 12.18466567993164, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443507712, + "loss": 0.3699, + "grad_norm": 11.099276542663574, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443395584, + "loss": 0.4507, + "grad_norm": 12.702630996704102, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443344896, + "loss": 0.4079, + "grad_norm": 10.903487205505371, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443314176, + "loss": 0.3, + "grad_norm": 10.615644454956055, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443337216, + "loss": 0.3518, + "grad_norm": 10.656424522399902, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.509605376, + "gpu_mem": 4.44342016, + "loss": 0.4096, + "grad_norm": 16.58029556274414, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443347968, + "loss": 0.4284, + "grad_norm": 16.37483024597168, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443513856, + "loss": 0.2537, + "grad_norm": 8.520498275756836, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.509605376, + "gpu_mem": 4.443355648, + "loss": 0.3216, + "grad_norm": 15.882022857666016, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443332608, + "loss": 0.3527, + "grad_norm": 11.748701095581055, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.509801984, + "gpu_mem": 4.44345856, + "loss": 0.4225, + "grad_norm": 10.945619583129883, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443492352, + "loss": 0.3201, + "grad_norm": 10.223454475402832, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443360256, + "loss": 0.2756, + "grad_norm": 8.79985237121582, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.509801984, + "gpu_mem": 4.44349696, + "loss": 0.3005, + "grad_norm": 9.963611602783203, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443410944, + "loss": 0.2828, + "grad_norm": 13.369943618774414, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443375616, + "loss": 0.2504, + "grad_norm": 8.19646167755127, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443340288, + "loss": 0.2712, + "grad_norm": 10.986623764038086, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.509801984, + "gpu_mem": 4.44348928, + "loss": 0.3418, + "grad_norm": 9.064874649047852, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443378688, + "loss": 0.3776, + "grad_norm": 13.318199157714844, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443323392, + "loss": 0.4031, + "grad_norm": 11.604644775390625, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443265024, + "loss": 0.2382, + "grad_norm": 7.073112487792969, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443321856, + "loss": 0.2682, + "grad_norm": 8.373688697814941, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443599872, + "loss": 0.4171, + "grad_norm": 13.975024223327637, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443323392, + "loss": 0.3271, + "grad_norm": 9.13338851928711, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443636736, + "loss": 0.3622, + "grad_norm": 12.784040451049805, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.509801984, + "gpu_mem": 4.44351232, + "loss": 0.2907, + "grad_norm": 9.858100891113281, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443268096, + "loss": 0.2375, + "grad_norm": 7.437261581420898, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443328, + "loss": 0.3846, + "grad_norm": 15.018045425415039, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.509801984, + "gpu_mem": 4.44338944, + "loss": 0.4068, + "grad_norm": 13.178804397583008, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443390976, + "loss": 0.3033, + "grad_norm": 11.403144836425781, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443644416, + "loss": 0.3641, + "grad_norm": 11.864053726196289, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443294208, + "loss": 0.5409, + "grad_norm": 16.487295150756836, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443590656, + "loss": 0.2953, + "grad_norm": 10.208303451538086, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443452416, + "loss": 0.3304, + "grad_norm": 12.945493698120117, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.509801984, + "gpu_mem": 4.44330496, + "loss": 0.4223, + "grad_norm": 10.953478813171387, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443444736, + "loss": 0.2251, + "grad_norm": 8.908162117004395, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443323392, + "loss": 0.3956, + "grad_norm": 11.671682357788086, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443417088, + "loss": 0.4087, + "grad_norm": 10.720558166503906, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.509801984, + "gpu_mem": 4.44343552, + "loss": 0.2392, + "grad_norm": 7.951722621917725, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443387904, + "loss": 0.286, + "grad_norm": 8.448762893676758, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.509801984, + "gpu_mem": 4.443294208, + "loss": 0.2903, + "grad_norm": 7.876997947692871, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443386368, + "loss": 0.3307, + "grad_norm": 11.870707511901855, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443298816, + "loss": 0.2187, + "grad_norm": 9.706609725952148, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443341824, + "loss": 0.3021, + "grad_norm": 8.621807098388672, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44353536, + "loss": 0.3015, + "grad_norm": 10.553934097290039, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443331072, + "loss": 0.3854, + "grad_norm": 11.682550430297852, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443487744, + "loss": 0.3721, + "grad_norm": 12.575425148010254, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443314176, + "loss": 0.3563, + "grad_norm": 13.305479049682617, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44372736, + "loss": 0.372, + "grad_norm": 8.018106460571289, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443386368, + "loss": 0.3647, + "grad_norm": 11.9230375289917, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443303424, + "loss": 0.3188, + "grad_norm": 9.143546104431152, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44342016, + "loss": 0.2007, + "grad_norm": 7.097804069519043, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443375616, + "loss": 0.1669, + "grad_norm": 6.277010440826416, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443332608, + "loss": 0.2355, + "grad_norm": 8.6841459274292, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443369472, + "loss": 0.273, + "grad_norm": 7.298412322998047, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443457024, + "loss": 0.2702, + "grad_norm": 10.429343223571777, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443375616, + "loss": 0.3714, + "grad_norm": 10.106574058532715, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443590656, + "loss": 0.3821, + "grad_norm": 12.320114135742188, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443383296, + "loss": 0.2547, + "grad_norm": 9.98828125, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443387904, + "loss": 0.2823, + "grad_norm": 10.371786117553711, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443398656, + "loss": 0.3287, + "grad_norm": 9.897567749023438, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443437056, + "loss": 0.2544, + "grad_norm": 10.063066482543945, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44348928, + "loss": 0.3142, + "grad_norm": 10.186683654785156, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443347968, + "loss": 0.3402, + "grad_norm": 8.863958358764648, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44322816, + "loss": 0.2973, + "grad_norm": 9.513245582580566, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443455488, + "loss": 0.279, + "grad_norm": 10.836873054504395, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443699712, + "loss": 0.32, + "grad_norm": 11.420705795288086, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443360256, + "loss": 0.3155, + "grad_norm": 10.466375350952148, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443306496, + "loss": 0.3167, + "grad_norm": 10.945162773132324, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443469312, + "loss": 0.4518, + "grad_norm": 12.45777416229248, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443409408, + "loss": 0.2909, + "grad_norm": 8.404402732849121, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44338944, + "loss": 0.3295, + "grad_norm": 10.143949508666992, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443324928, + "loss": 0.3426, + "grad_norm": 10.10682201385498, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443753472, + "loss": 0.2746, + "grad_norm": 8.048744201660156, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443460096, + "loss": 0.3007, + "grad_norm": 10.570841789245605, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443315712, + "loss": 0.3486, + "grad_norm": 8.057785987854004, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443369472, + "loss": 0.3287, + "grad_norm": 9.795562744140625, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443787264, + "loss": 0.2857, + "grad_norm": 13.503580093383789, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443556864, + "loss": 0.2571, + "grad_norm": 9.61729621887207, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443341824, + "loss": 0.4314, + "grad_norm": 11.989745140075684, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443426304, + "loss": 0.3231, + "grad_norm": 10.468039512634277, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44335104, + "loss": 0.3028, + "grad_norm": 13.236854553222656, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443386368, + "loss": 0.3675, + "grad_norm": 10.517420768737793, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443469312, + "loss": 0.3511, + "grad_norm": 9.854143142700195, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.510785024, + "gpu_mem": 4.443386368, + "loss": 0.4175, + "grad_norm": 13.601873397827148, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44341248, + "loss": 0.3715, + "grad_norm": 12.147957801818848, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.510785024, + "gpu_mem": 4.44341248, + "train_runtime": 4455.2502, + "train_samples_per_second": 4.232, + "train_steps_per_second": 0.066, + "total_flos": 0.0, + "train_loss": 0.610709656562124 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0052eed638e4aeb48f103586efb96096bb8d3ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e6b86adedc8194c6e134a2301025fcf81e6d28d7 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.7929663608562691 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..ddfc8f276f499bec8a74c3cfae2c47c2a4087ce3 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25389056 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-boolq-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2", + "seed": 42, + "timestamp": "2025-08-31T02:06:30.838829" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..8aba3e6fe72f53aefc135ce189aa24aa9bb41419 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.503981568, + "gpu_mem": 4.519328768, + "loss": 8.869, + "grad_norm": 233.80860900878906, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.510076416, + "gpu_mem": 4.722578944, + "loss": 8.9376, + "grad_norm": 239.66294860839844, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.510862848, + "gpu_mem": 4.722497536, + "loss": 6.3632, + "grad_norm": 227.827392578125, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.511452672, + "gpu_mem": 4.722497536, + "loss": 2.6529, + "grad_norm": 144.43919372558594, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.512042496, + "gpu_mem": 4.722433024, + "loss": 1.212, + "grad_norm": 47.84184265136719, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.51263232, + "gpu_mem": 4.722452992, + "loss": 0.8617, + "grad_norm": 43.27740478515625, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.513222144, + "gpu_mem": 4.722505216, + "loss": 1.1695, + "grad_norm": 69.79517364501953, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.51361536, + "gpu_mem": 4.722591232, + "loss": 0.6559, + "grad_norm": 18.012210845947266, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.514008576, + "gpu_mem": 4.722499072, + "loss": 2.141, + "grad_norm": 79.17948913574219, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.5145984, + "gpu_mem": 4.722399232, + "loss": 1.456, + "grad_norm": 58.11606979370117, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.515188224, + "gpu_mem": 4.72250368, + "loss": 0.6457, + "grad_norm": 6.229933738708496, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.51558144, + "gpu_mem": 4.722875392, + "loss": 1.2234, + "grad_norm": 116.1498794555664, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.515974656, + "gpu_mem": 4.722479104, + "loss": 1.2587, + "grad_norm": 52.9954719543457, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.516367872, + "gpu_mem": 4.722456064, + "loss": 0.686, + "grad_norm": 9.985751152038574, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.516761088, + "gpu_mem": 4.722394624, + "loss": 1.1792, + "grad_norm": 29.390972137451172, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.516957696, + "gpu_mem": 4.722479104, + "loss": 1.8027, + "grad_norm": 77.97930908203125, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.517350912, + "gpu_mem": 4.72251904, + "loss": 0.9339, + "grad_norm": 31.851720809936523, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.517744128, + "gpu_mem": 4.722582016, + "loss": 1.2629, + "grad_norm": 45.89571762084961, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.518137344, + "gpu_mem": 4.7224192, + "loss": 1.2291, + "grad_norm": 32.983150482177734, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.51853056, + "gpu_mem": 4.722531328, + "loss": 0.8234, + "grad_norm": 22.25924301147461, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.518727168, + "gpu_mem": 4.722689536, + "loss": 0.6795, + "grad_norm": 8.868040084838867, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.519120384, + "gpu_mem": 4.722582016, + "loss": 1.0734, + "grad_norm": 22.563232421875, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.519316992, + "gpu_mem": 4.722554368, + "loss": 0.8224, + "grad_norm": 13.91633415222168, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.5195136, + "gpu_mem": 4.7226112, + "loss": 0.7573, + "grad_norm": 16.691375732421875, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.519906816, + "gpu_mem": 4.72239616, + "loss": 0.6975, + "grad_norm": 6.51262092590332, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.520300032, + "gpu_mem": 4.722451456, + "loss": 0.882, + "grad_norm": 14.44697093963623, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.52049664, + "gpu_mem": 4.722743296, + "loss": 0.6362, + "grad_norm": 3.7360706329345703, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.520693248, + "gpu_mem": 4.722422272, + "loss": 0.7593, + "grad_norm": 8.032002449035645, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.521086464, + "gpu_mem": 4.722486784, + "loss": 0.7451, + "grad_norm": 12.261842727661133, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.521283072, + "gpu_mem": 4.72256512, + "loss": 0.7261, + "grad_norm": 7.222959518432617, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.52147968, + "gpu_mem": 4.722368512, + "loss": 0.6136, + "grad_norm": 2.5524110794067383, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.521676288, + "gpu_mem": 4.722482176, + "loss": 0.7509, + "grad_norm": 9.033954620361328, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.521872896, + "gpu_mem": 4.722720256, + "loss": 0.7194, + "grad_norm": 8.025568008422852, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.522069504, + "gpu_mem": 4.722422272, + "loss": 0.5858, + "grad_norm": 2.7295961380004883, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.522266112, + "gpu_mem": 4.722632704, + "loss": 0.8056, + "grad_norm": 10.636892318725586, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.52246272, + "gpu_mem": 4.722583552, + "loss": 0.6826, + "grad_norm": 2.589643955230713, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.522659328, + "gpu_mem": 4.722394624, + "loss": 0.6468, + "grad_norm": 4.602322101593018, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.522855936, + "gpu_mem": 4.72264192, + "loss": 0.8638, + "grad_norm": 11.864049911499023, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.523052544, + "gpu_mem": 4.723021312, + "loss": 0.7293, + "grad_norm": 7.218650817871094, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.523249152, + "gpu_mem": 4.722591232, + "loss": 0.6592, + "grad_norm": 2.3266420364379883, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.52344576, + "gpu_mem": 4.72281856, + "loss": 0.6739, + "grad_norm": 2.876038074493408, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.523642368, + "gpu_mem": 4.722715648, + "loss": 0.6569, + "grad_norm": 5.236875057220459, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.523838976, + "gpu_mem": 4.722537472, + "loss": 0.6302, + "grad_norm": 2.8164334297180176, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.524035584, + "gpu_mem": 4.72268032, + "loss": 0.6166, + "grad_norm": 4.41773796081543, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.524232192, + "gpu_mem": 4.722460672, + "loss": 0.8092, + "grad_norm": 9.20035457611084, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.524232192, + "gpu_mem": 4.72270336, + "loss": 0.8993, + "grad_norm": 15.634708404541016, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.5244288, + "gpu_mem": 4.72242688, + "loss": 0.8146, + "grad_norm": 14.326847076416016, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.524625408, + "gpu_mem": 4.72250368, + "loss": 0.6621, + "grad_norm": 7.587663650512695, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.524822016, + "gpu_mem": 4.722520576, + "loss": 0.9905, + "grad_norm": 13.618229866027832, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.524822016, + "gpu_mem": 4.722459136, + "loss": 0.8435, + "grad_norm": 9.786823272705078, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.524822016, + "gpu_mem": 4.722463744, + "loss": 0.6079, + "grad_norm": 3.4515862464904785, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.525018624, + "gpu_mem": 4.722543616, + "loss": 0.6645, + "grad_norm": 2.619760751724243, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.525018624, + "gpu_mem": 4.722566656, + "loss": 0.7463, + "grad_norm": 10.795845031738281, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.525215232, + "gpu_mem": 4.722494464, + "loss": 0.6756, + "grad_norm": 3.1487743854522705, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.52541184, + "gpu_mem": 4.7227648, + "loss": 0.6088, + "grad_norm": 2.5326249599456787, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.525608448, + "gpu_mem": 4.722551296, + "loss": 0.8536, + "grad_norm": 9.061946868896484, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.525608448, + "gpu_mem": 4.722545152, + "loss": 0.6668, + "grad_norm": 4.727614879608154, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.525805056, + "gpu_mem": 4.722440704, + "loss": 0.6361, + "grad_norm": 3.2904443740844727, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.525805056, + "gpu_mem": 4.7224576, + "loss": 0.6821, + "grad_norm": 1.5682965517044067, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.526001664, + "gpu_mem": 4.722551296, + "loss": 0.6106, + "grad_norm": 5.173000812530518, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.526001664, + "gpu_mem": 4.722562048, + "loss": 0.6116, + "grad_norm": 2.3430614471435547, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.526198272, + "gpu_mem": 4.72254976, + "loss": 0.7339, + "grad_norm": 6.764347553253174, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.526198272, + "gpu_mem": 4.72254208, + "loss": 0.5956, + "grad_norm": 6.654665946960449, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.526198272, + "gpu_mem": 4.722471424, + "loss": 0.6023, + "grad_norm": 3.233610153198242, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.52639488, + "gpu_mem": 4.722515968, + "loss": 0.6724, + "grad_norm": 5.837782382965088, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.52639488, + "gpu_mem": 4.722709504, + "loss": 0.6014, + "grad_norm": 4.1243205070495605, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.52639488, + "gpu_mem": 4.7224192, + "loss": 0.6198, + "grad_norm": 4.467631816864014, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.526591488, + "gpu_mem": 4.722386944, + "loss": 0.6373, + "grad_norm": 3.114952802658081, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.526591488, + "gpu_mem": 4.722452992, + "loss": 0.6229, + "grad_norm": 4.952956199645996, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.526591488, + "gpu_mem": 4.722446848, + "loss": 0.6326, + "grad_norm": 3.901777744293213, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.526591488, + "gpu_mem": 4.722675712, + "loss": 0.525, + "grad_norm": 4.247412204742432, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.526788096, + "gpu_mem": 4.722668032, + "loss": 0.5622, + "grad_norm": 5.4284772872924805, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.526984704, + "gpu_mem": 4.72263424, + "loss": 0.7902, + "grad_norm": 8.788385391235352, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.526984704, + "gpu_mem": 4.722494464, + "loss": 0.5742, + "grad_norm": 8.913131713867188, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.526984704, + "gpu_mem": 4.7224192, + "loss": 0.6146, + "grad_norm": 8.38949203491211, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.526984704, + "gpu_mem": 4.722359296, + "loss": 0.5725, + "grad_norm": 4.2171711921691895, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.526984704, + "gpu_mem": 4.722433024, + "loss": 0.6514, + "grad_norm": 6.086472988128662, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.526984704, + "gpu_mem": 4.722485248, + "loss": 0.8012, + "grad_norm": 10.957486152648926, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.527181312, + "gpu_mem": 4.722617344, + "loss": 0.5858, + "grad_norm": 3.74336576461792, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.527181312, + "gpu_mem": 4.722508288, + "loss": 0.6445, + "grad_norm": 7.315963268280029, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.527181312, + "gpu_mem": 4.72238848, + "loss": 0.6349, + "grad_norm": 4.654501438140869, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.527181312, + "gpu_mem": 4.7224576, + "loss": 0.6408, + "grad_norm": 4.707687854766846, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.527181312, + "gpu_mem": 4.72255744, + "loss": 0.7144, + "grad_norm": 7.812034606933594, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.527181312, + "gpu_mem": 4.722520576, + "loss": 0.5678, + "grad_norm": 3.4686105251312256, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.527181312, + "gpu_mem": 4.722552832, + "loss": 0.5893, + "grad_norm": 5.409703254699707, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.52737792, + "gpu_mem": 4.72250368, + "loss": 0.6168, + "grad_norm": 7.087602138519287, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.52737792, + "gpu_mem": 4.72251136, + "loss": 0.5399, + "grad_norm": 5.04774284362793, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.52737792, + "gpu_mem": 4.722655744, + "loss": 0.512, + "grad_norm": 4.10666561126709, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.52737792, + "gpu_mem": 4.722437632, + "loss": 0.5958, + "grad_norm": 5.078695774078369, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722491392, + "loss": 0.5342, + "grad_norm": 4.536576747894287, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722459136, + "loss": 0.4877, + "grad_norm": 4.9868693351745605, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722540544, + "loss": 0.7501, + "grad_norm": 11.705029487609863, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722343936, + "loss": 0.7142, + "grad_norm": 9.641088485717773, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.527574528, + "gpu_mem": 4.7224576, + "loss": 0.5724, + "grad_norm": 6.110875129699707, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722477568, + "loss": 0.6165, + "grad_norm": 4.48118257522583, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722515968, + "loss": 0.5351, + "grad_norm": 2.5242867469787598, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722500608, + "loss": 0.7382, + "grad_norm": 7.024951457977295, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722413056, + "loss": 0.6812, + "grad_norm": 5.02927827835083, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722362368, + "loss": 0.5979, + "grad_norm": 5.04942512512207, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722479104, + "loss": 0.6131, + "grad_norm": 6.651904106140137, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.527574528, + "gpu_mem": 4.72251136, + "loss": 0.6984, + "grad_norm": 3.8917150497436523, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722545152, + "loss": 0.5914, + "grad_norm": 2.322913885116577, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.527574528, + "gpu_mem": 4.72259584, + "loss": 0.6322, + "grad_norm": 3.5212390422821045, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722500608, + "loss": 0.5647, + "grad_norm": 3.077224016189575, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722601984, + "loss": 0.5667, + "grad_norm": 3.436150550842285, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722552832, + "loss": 0.5603, + "grad_norm": 5.609046936035156, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722440704, + "loss": 0.4929, + "grad_norm": 4.49297571182251, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722625024, + "loss": 0.5647, + "grad_norm": 5.14377498626709, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722479104, + "loss": 0.591, + "grad_norm": 4.947544574737549, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722482176, + "loss": 0.5745, + "grad_norm": 4.043002605438232, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.527574528, + "gpu_mem": 4.722451456, + "loss": 0.5688, + "grad_norm": 6.002414703369141, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.527771136, + "gpu_mem": 4.722497536, + "loss": 0.6035, + "grad_norm": 5.334151268005371, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.527771136, + "gpu_mem": 4.72248832, + "loss": 0.5538, + "grad_norm": 4.7518310546875, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.527771136, + "gpu_mem": 4.722469888, + "loss": 0.4641, + "grad_norm": 4.555304527282715, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722545152, + "loss": 0.6187, + "grad_norm": 5.561794281005859, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.527967744, + "gpu_mem": 4.72246528, + "loss": 0.5549, + "grad_norm": 4.324403285980225, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722356224, + "loss": 0.4801, + "grad_norm": 2.37454891204834, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722594304, + "loss": 0.6734, + "grad_norm": 4.633813858032227, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.527967744, + "gpu_mem": 4.7227648, + "loss": 0.4906, + "grad_norm": 4.976076126098633, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722497536, + "loss": 0.4778, + "grad_norm": 4.014054775238037, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722525184, + "loss": 0.5773, + "grad_norm": 4.499019622802734, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722575872, + "loss": 0.4382, + "grad_norm": 4.889860153198242, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722385408, + "loss": 0.5738, + "grad_norm": 6.226327419281006, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722827776, + "loss": 0.5588, + "grad_norm": 5.272500514984131, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722554368, + "loss": 0.6352, + "grad_norm": 8.107681274414062, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.527967744, + "gpu_mem": 4.722437632, + "loss": 0.5789, + "grad_norm": 6.859676837921143, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722876928, + "loss": 0.5435, + "grad_norm": 4.632746696472168, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722652672, + "loss": 0.4812, + "grad_norm": 3.7110488414764404, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722692608, + "loss": 0.5938, + "grad_norm": 6.665708541870117, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722474496, + "loss": 0.5986, + "grad_norm": 3.7718665599823, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.530130432, + "gpu_mem": 4.72260352, + "loss": 0.5023, + "grad_norm": 3.4843013286590576, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722684928, + "loss": 0.5568, + "grad_norm": 6.419014930725098, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722468352, + "loss": 0.4996, + "grad_norm": 4.196140766143799, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722601984, + "loss": 0.4804, + "grad_norm": 3.8392629623413086, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722625024, + "loss": 0.664, + "grad_norm": 9.533957481384277, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722462208, + "loss": 0.526, + "grad_norm": 6.26369047164917, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.530130432, + "gpu_mem": 4.7223424, + "loss": 0.5442, + "grad_norm": 5.538697719573975, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722523648, + "loss": 0.4846, + "grad_norm": 3.0243053436279297, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722422272, + "loss": 0.5617, + "grad_norm": 5.228503704071045, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722474496, + "loss": 0.5831, + "grad_norm": 6.361831188201904, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722506752, + "loss": 0.5879, + "grad_norm": 5.449702262878418, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722646528, + "loss": 0.4928, + "grad_norm": 4.112429141998291, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722629632, + "loss": 0.6331, + "grad_norm": 4.983449459075928, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722821632, + "loss": 0.6526, + "grad_norm": 6.455733299255371, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722532864, + "loss": 0.505, + "grad_norm": 3.729292392730713, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722568192, + "loss": 0.5684, + "grad_norm": 3.6822805404663086, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.530130432, + "gpu_mem": 4.722466816, + "loss": 0.5114, + "grad_norm": 3.8303308486938477, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824187392, + "loss": 0.7086, + "grad_norm": 4.2697529792785645, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82412288, + "loss": 0.4824, + "grad_norm": 3.1086933612823486, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823960064, + "loss": 0.4912, + "grad_norm": 3.0585293769836426, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824032256, + "loss": 0.5404, + "grad_norm": 4.751421928405762, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824067584, + "loss": 0.4155, + "grad_norm": 3.7576749324798584, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82409216, + "loss": 0.5912, + "grad_norm": 4.714956283569336, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82405376, + "loss": 0.4068, + "grad_norm": 6.185878753662109, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824274944, + "loss": 0.5478, + "grad_norm": 5.617146015167236, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824182784, + "loss": 0.3949, + "grad_norm": 3.8420050144195557, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824089088, + "loss": 0.3574, + "grad_norm": 3.955946683883667, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82401536, + "loss": 0.478, + "grad_norm": 5.138974189758301, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824364032, + "loss": 0.3684, + "grad_norm": 5.789059638977051, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823958528, + "loss": 0.417, + "grad_norm": 4.242332935333252, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823904768, + "loss": 0.4055, + "grad_norm": 5.762329578399658, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824680448, + "loss": 0.3548, + "grad_norm": 4.2064528465271, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824156672, + "loss": 0.4353, + "grad_norm": 7.034615993499756, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82406912, + "loss": 0.4521, + "grad_norm": 5.314269065856934, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824018432, + "loss": 0.4232, + "grad_norm": 4.84352970123291, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824113664, + "loss": 0.2749, + "grad_norm": 6.184368133544922, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82403072, + "loss": 0.5079, + "grad_norm": 6.006751537322998, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824049152, + "loss": 0.4548, + "grad_norm": 6.466878414154053, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824136704, + "loss": 0.4594, + "grad_norm": 6.34998083114624, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824021504, + "loss": 0.5691, + "grad_norm": 7.32921028137207, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82408448, + "loss": 0.5156, + "grad_norm": 7.228818416595459, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82399232, + "loss": 0.3484, + "grad_norm": 5.381630897521973, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82429184, + "loss": 0.4047, + "grad_norm": 7.392773151397705, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82401536, + "loss": 0.5447, + "grad_norm": 7.2776360511779785, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823981568, + "loss": 0.422, + "grad_norm": 5.06213903427124, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824119808, + "loss": 0.4156, + "grad_norm": 7.205942630767822, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824218112, + "loss": 0.4374, + "grad_norm": 4.606486797332764, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823964672, + "loss": 0.4501, + "grad_norm": 5.773846626281738, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824064512, + "loss": 0.4543, + "grad_norm": 5.905703544616699, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824036864, + "loss": 0.4158, + "grad_norm": 5.3787360191345215, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823973888, + "loss": 0.3065, + "grad_norm": 4.495090007781982, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824192, + "loss": 0.5224, + "grad_norm": 7.384599685668945, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824089088, + "loss": 0.3435, + "grad_norm": 5.406001091003418, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824036864, + "loss": 0.396, + "grad_norm": 5.245143890380859, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82401536, + "loss": 0.5028, + "grad_norm": 8.920019149780273, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824024576, + "loss": 0.315, + "grad_norm": 6.18515682220459, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823956992, + "loss": 0.4535, + "grad_norm": 10.364043235778809, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824119808, + "loss": 0.2713, + "grad_norm": 4.690507888793945, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823989248, + "loss": 0.4559, + "grad_norm": 5.3982133865356445, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824109056, + "loss": 0.3843, + "grad_norm": 4.67440128326416, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823927808, + "loss": 0.3162, + "grad_norm": 5.031257152557373, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824059904, + "loss": 0.2999, + "grad_norm": 5.343701362609863, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824033792, + "loss": 0.3915, + "grad_norm": 7.458854675292969, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824, + "loss": 0.345, + "grad_norm": 7.649142265319824, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824104448, + "loss": 0.2545, + "grad_norm": 6.707334518432617, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82409984, + "loss": 0.4273, + "grad_norm": 6.2424116134643555, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823958528, + "loss": 0.3421, + "grad_norm": 6.736060619354248, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824150528, + "loss": 0.409, + "grad_norm": 6.864956378936768, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824001536, + "loss": 0.4033, + "grad_norm": 8.16348934173584, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824104448, + "loss": 0.2264, + "grad_norm": 4.169793605804443, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.530130432, + "gpu_mem": 4.8243072, + "loss": 0.301, + "grad_norm": 5.401573657989502, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824109056, + "loss": 0.2946, + "grad_norm": 5.444881439208984, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823995392, + "loss": 0.3586, + "grad_norm": 7.087218284606934, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824009216, + "loss": 0.2954, + "grad_norm": 5.234076976776123, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82405376, + "loss": 0.3908, + "grad_norm": 6.1905012130737305, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823995392, + "loss": 0.3348, + "grad_norm": 5.874136447906494, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824228864, + "loss": 0.3414, + "grad_norm": 5.023383617401123, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824259584, + "loss": 0.2438, + "grad_norm": 7.677697658538818, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824185856, + "loss": 0.3552, + "grad_norm": 5.987409591674805, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824073728, + "loss": 0.4027, + "grad_norm": 5.519845485687256, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82402304, + "loss": 0.3283, + "grad_norm": 6.361313819885254, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82399232, + "loss": 0.3592, + "grad_norm": 5.429263591766357, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82401536, + "loss": 0.4217, + "grad_norm": 8.665788650512695, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824098304, + "loss": 0.421, + "grad_norm": 7.960748195648193, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824026112, + "loss": 0.397, + "grad_norm": 8.969430923461914, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824192, + "loss": 0.373, + "grad_norm": 5.677824020385742, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824033792, + "loss": 0.4046, + "grad_norm": 6.211999893188477, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824010752, + "loss": 0.3678, + "grad_norm": 5.141634464263916, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824136704, + "loss": 0.4164, + "grad_norm": 5.972975254058838, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824170496, + "loss": 0.3468, + "grad_norm": 5.6360673904418945, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.530130432, + "gpu_mem": 4.8240384, + "loss": 0.3753, + "grad_norm": 5.116311550140381, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824175104, + "loss": 0.3698, + "grad_norm": 6.675261974334717, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824089088, + "loss": 0.4058, + "grad_norm": 6.807044982910156, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82405376, + "loss": 0.2952, + "grad_norm": 4.836634635925293, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824018432, + "loss": 0.2557, + "grad_norm": 3.945059299468994, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824167424, + "loss": 0.3338, + "grad_norm": 4.126738548278809, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824056832, + "loss": 0.2903, + "grad_norm": 6.6302809715271, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824001536, + "loss": 0.4299, + "grad_norm": 7.667603015899658, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823943168, + "loss": 0.2388, + "grad_norm": 4.576896667480469, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824, + "loss": 0.3806, + "grad_norm": 6.581130027770996, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824278016, + "loss": 0.3558, + "grad_norm": 4.444263458251953, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824001536, + "loss": 0.3733, + "grad_norm": 6.31781005859375, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82431488, + "loss": 0.2761, + "grad_norm": 4.112189769744873, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824190464, + "loss": 0.218, + "grad_norm": 3.6443135738372803, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82394624, + "loss": 0.3588, + "grad_norm": 6.56972599029541, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824006144, + "loss": 0.3269, + "grad_norm": 5.762430191040039, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824067584, + "loss": 0.4199, + "grad_norm": 6.846770763397217, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82406912, + "loss": 0.3317, + "grad_norm": 5.823359489440918, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82432256, + "loss": 0.3896, + "grad_norm": 7.253147125244141, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823972352, + "loss": 0.5372, + "grad_norm": 7.62632417678833, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.530130432, + "gpu_mem": 4.8242688, + "loss": 0.2841, + "grad_norm": 5.374094009399414, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82413056, + "loss": 0.2746, + "grad_norm": 5.998660564422607, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823983104, + "loss": 0.4833, + "grad_norm": 7.127227306365967, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82412288, + "loss": 0.1932, + "grad_norm": 3.3074233531951904, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824001536, + "loss": 0.4678, + "grad_norm": 7.167958736419678, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824095232, + "loss": 0.3263, + "grad_norm": 5.361066818237305, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824113664, + "loss": 0.3142, + "grad_norm": 7.1523284912109375, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824066048, + "loss": 0.4077, + "grad_norm": 5.50556755065918, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823972352, + "loss": 0.2624, + "grad_norm": 4.57589864730835, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824064512, + "loss": 0.2678, + "grad_norm": 6.028458118438721, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82397696, + "loss": 0.2385, + "grad_norm": 9.162259101867676, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824019968, + "loss": 0.3512, + "grad_norm": 4.852019309997559, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824213504, + "loss": 0.3405, + "grad_norm": 6.2241082191467285, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824009216, + "loss": 0.3576, + "grad_norm": 5.062891960144043, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824165888, + "loss": 0.4266, + "grad_norm": 6.995367527008057, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82399232, + "loss": 0.2653, + "grad_norm": 4.953624248504639, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824405504, + "loss": 0.4008, + "grad_norm": 4.404302597045898, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824064512, + "loss": 0.4191, + "grad_norm": 7.855671405792236, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823981568, + "loss": 0.2937, + "grad_norm": 7.6881279945373535, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824098304, + "loss": 0.1671, + "grad_norm": 4.152520179748535, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82405376, + "loss": 0.2918, + "grad_norm": 4.768893241882324, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824010752, + "loss": 0.2511, + "grad_norm": 3.7594785690307617, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824047616, + "loss": 0.2518, + "grad_norm": 4.245553970336914, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824135168, + "loss": 0.2495, + "grad_norm": 4.321831226348877, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82405376, + "loss": 0.3896, + "grad_norm": 5.726314067840576, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.530130432, + "gpu_mem": 4.8242688, + "loss": 0.4383, + "grad_norm": 5.080480098724365, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82406144, + "loss": 0.3601, + "grad_norm": 5.688658714294434, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824066048, + "loss": 0.3501, + "grad_norm": 6.421919345855713, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.530130432, + "gpu_mem": 4.8240768, + "loss": 0.3546, + "grad_norm": 6.48486328125, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.530130432, + "gpu_mem": 4.8241152, + "loss": 0.2864, + "grad_norm": 5.4872260093688965, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824167424, + "loss": 0.3182, + "grad_norm": 5.292596817016602, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824026112, + "loss": 0.3154, + "grad_norm": 4.313265800476074, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823906304, + "loss": 0.2951, + "grad_norm": 5.555436134338379, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824133632, + "loss": 0.2837, + "grad_norm": 4.105805397033691, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824377856, + "loss": 0.2878, + "grad_norm": 6.073948860168457, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.530130432, + "gpu_mem": 4.8240384, + "loss": 0.2768, + "grad_norm": 4.70308256149292, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82398464, + "loss": 0.3124, + "grad_norm": 4.56823205947876, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824147456, + "loss": 0.3178, + "grad_norm": 6.935946464538574, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824087552, + "loss": 0.4199, + "grad_norm": 5.427887439727783, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824067584, + "loss": 0.3266, + "grad_norm": 4.603908538818359, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824003072, + "loss": 0.3995, + "grad_norm": 4.938188076019287, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824431616, + "loss": 0.2409, + "grad_norm": 4.201538562774658, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.530130432, + "gpu_mem": 4.82413824, + "loss": 0.3625, + "grad_norm": 6.524600982666016, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.530130432, + "gpu_mem": 4.823993856, + "loss": 0.3915, + "grad_norm": 4.94792366027832, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824047616, + "loss": 0.3007, + "grad_norm": 6.609028339385986, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824465408, + "loss": 0.293, + "grad_norm": 4.769420623779297, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824235008, + "loss": 0.3999, + "grad_norm": 8.176187515258789, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824019968, + "loss": 0.4427, + "grad_norm": 6.034461975097656, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824104448, + "loss": 0.254, + "grad_norm": 7.134485721588135, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824029184, + "loss": 0.405, + "grad_norm": 6.609726905822754, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824064512, + "loss": 0.4325, + "grad_norm": 6.219490051269531, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824147456, + "loss": 0.3779, + "grad_norm": 4.7131218910217285, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824064512, + "loss": 0.4359, + "grad_norm": 6.299029350280762, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824090624, + "loss": 0.3799, + "grad_norm": 6.89005184173584, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.530130432, + "gpu_mem": 4.824090624, + "train_runtime": 4484.6084, + "train_samples_per_second": 4.204, + "train_steps_per_second": 0.066, + "total_flos": 0.0, + "train_loss": 0.6128277448671204 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6934cfad94edb068f0d54db83e6a8b58f0fc939 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d55a8571d9449dc9e1004af51164889aeacf327c --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.7033639143730887 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..227dc4081e739bcbc03ff7a1fc183ed15847e14b --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6317696 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-boolq-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T19:09:28.617533" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..6a6316a83c4c9adb44c978e87bac61a37ad1b442 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.48830208, + "gpu_mem": 4.443082752, + "loss": 8.869, + "grad_norm": 234.86416625976562, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.494396928, + "gpu_mem": 4.493840896, + "loss": 8.9376, + "grad_norm": 240.33407592773438, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.49518336, + "gpu_mem": 4.493759488, + "loss": 7.5679, + "grad_norm": 243.47679138183594, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.495773184, + "gpu_mem": 4.493759488, + "loss": 4.959, + "grad_norm": 228.1814727783203, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.496363008, + "gpu_mem": 4.493694976, + "loss": 2.537, + "grad_norm": 137.45384216308594, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.496952832, + "gpu_mem": 4.493714944, + "loss": 1.4387, + "grad_norm": 56.679893493652344, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.497542656, + "gpu_mem": 4.493767168, + "loss": 0.8578, + "grad_norm": 21.124313354492188, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.497935872, + "gpu_mem": 4.493853184, + "loss": 0.6193, + "grad_norm": 10.238547325134277, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.498329088, + "gpu_mem": 4.493761024, + "loss": 0.6998, + "grad_norm": 18.19664764404297, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.498918912, + "gpu_mem": 4.493661184, + "loss": 1.6846, + "grad_norm": 192.40335083007812, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.499312128, + "gpu_mem": 4.493765632, + "loss": 1.0755, + "grad_norm": 123.23554229736328, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.499705344, + "gpu_mem": 4.494137344, + "loss": 1.2442, + "grad_norm": 281.9166259765625, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.50009856, + "gpu_mem": 4.493741056, + "loss": 1.585, + "grad_norm": 92.95726013183594, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.500688384, + "gpu_mem": 4.493718016, + "loss": 0.7672, + "grad_norm": 36.01921081542969, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.500884992, + "gpu_mem": 4.493656576, + "loss": 1.018, + "grad_norm": 49.682037353515625, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.501278208, + "gpu_mem": 4.493741056, + "loss": 0.9599, + "grad_norm": 113.08747100830078, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.501671424, + "gpu_mem": 4.493780992, + "loss": 0.6904, + "grad_norm": 5.700827598571777, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.50206464, + "gpu_mem": 4.493843968, + "loss": 1.0158, + "grad_norm": 47.9433479309082, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.502261248, + "gpu_mem": 4.493681152, + "loss": 1.2045, + "grad_norm": 46.0986213684082, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.502654464, + "gpu_mem": 4.49379328, + "loss": 0.617, + "grad_norm": 4.902522087097168, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.50304768, + "gpu_mem": 4.493951488, + "loss": 0.6933, + "grad_norm": 14.804486274719238, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.503244288, + "gpu_mem": 4.493843968, + "loss": 0.7679, + "grad_norm": 14.584829330444336, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.503440896, + "gpu_mem": 4.49381632, + "loss": 0.6614, + "grad_norm": 6.266756057739258, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.503834112, + "gpu_mem": 4.493873152, + "loss": 0.6063, + "grad_norm": 5.272337913513184, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.504227328, + "gpu_mem": 4.493658112, + "loss": 0.7254, + "grad_norm": 11.210253715515137, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.504423936, + "gpu_mem": 4.493713408, + "loss": 0.7634, + "grad_norm": 18.455121994018555, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.504620544, + "gpu_mem": 4.494005248, + "loss": 0.7479, + "grad_norm": 19.921911239624023, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.50501376, + "gpu_mem": 4.493684224, + "loss": 0.845, + "grad_norm": 19.626916885375977, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.505210368, + "gpu_mem": 4.493748736, + "loss": 0.678, + "grad_norm": 8.194727897644043, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.505406976, + "gpu_mem": 4.493827072, + "loss": 0.7646, + "grad_norm": 10.00369644165039, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.505603584, + "gpu_mem": 4.493630464, + "loss": 0.6244, + "grad_norm": 6.776846885681152, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.505800192, + "gpu_mem": 4.493744128, + "loss": 0.7586, + "grad_norm": 16.354310989379883, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.506193408, + "gpu_mem": 4.493982208, + "loss": 0.7397, + "grad_norm": 12.160492897033691, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.506390016, + "gpu_mem": 4.493684224, + "loss": 0.6239, + "grad_norm": 9.15272331237793, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.506586624, + "gpu_mem": 4.493894656, + "loss": 0.6729, + "grad_norm": 5.571009635925293, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.506783232, + "gpu_mem": 4.493845504, + "loss": 0.7223, + "grad_norm": 25.372941970825195, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.50697984, + "gpu_mem": 4.493656576, + "loss": 0.8172, + "grad_norm": 16.308820724487305, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.507373056, + "gpu_mem": 4.493903872, + "loss": 0.6857, + "grad_norm": 4.158070087432861, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.507569664, + "gpu_mem": 4.494283264, + "loss": 0.664, + "grad_norm": 3.092892646789551, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.507569664, + "gpu_mem": 4.493853184, + "loss": 0.6499, + "grad_norm": 4.837502956390381, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.507766272, + "gpu_mem": 4.494080512, + "loss": 0.6745, + "grad_norm": 2.230825424194336, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.507766272, + "gpu_mem": 4.4939776, + "loss": 0.6285, + "grad_norm": 6.971991539001465, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.50796288, + "gpu_mem": 4.493799424, + "loss": 0.6839, + "grad_norm": 7.030607223510742, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.508159488, + "gpu_mem": 4.493942272, + "loss": 0.6815, + "grad_norm": 9.800080299377441, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.508356096, + "gpu_mem": 4.493722624, + "loss": 1.188, + "grad_norm": 21.56556510925293, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.508552704, + "gpu_mem": 4.493965312, + "loss": 0.7325, + "grad_norm": 7.097214221954346, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.508552704, + "gpu_mem": 4.493688832, + "loss": 0.686, + "grad_norm": 10.250021934509277, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.508749312, + "gpu_mem": 4.493765632, + "loss": 0.8689, + "grad_norm": 17.215639114379883, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.50894592, + "gpu_mem": 4.493782528, + "loss": 0.7259, + "grad_norm": 3.048801898956299, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.509142528, + "gpu_mem": 4.493721088, + "loss": 0.6172, + "grad_norm": 2.2575485706329346, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.509142528, + "gpu_mem": 4.493725696, + "loss": 0.5723, + "grad_norm": 2.044959306716919, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.509339136, + "gpu_mem": 4.493805568, + "loss": 0.6726, + "grad_norm": 4.235073089599609, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.509339136, + "gpu_mem": 4.493828608, + "loss": 0.6754, + "grad_norm": 10.029523849487305, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.509535744, + "gpu_mem": 4.493756416, + "loss": 0.6683, + "grad_norm": 4.766758918762207, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.509535744, + "gpu_mem": 4.494026752, + "loss": 0.6831, + "grad_norm": 3.753432273864746, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.509732352, + "gpu_mem": 4.493813248, + "loss": 0.926, + "grad_norm": 12.049140930175781, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.509732352, + "gpu_mem": 4.493807104, + "loss": 0.7591, + "grad_norm": 7.700575351715088, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.50992896, + "gpu_mem": 4.493702656, + "loss": 0.6608, + "grad_norm": 2.9501571655273438, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.50992896, + "gpu_mem": 4.493719552, + "loss": 0.7055, + "grad_norm": 4.570174217224121, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.510125568, + "gpu_mem": 4.493813248, + "loss": 0.7155, + "grad_norm": 10.250066757202148, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.510125568, + "gpu_mem": 4.493824, + "loss": 0.6198, + "grad_norm": 1.4150381088256836, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.510125568, + "gpu_mem": 4.493811712, + "loss": 0.8658, + "grad_norm": 11.572601318359375, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.510125568, + "gpu_mem": 4.493804032, + "loss": 0.5689, + "grad_norm": 2.9226982593536377, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.510518784, + "gpu_mem": 4.493733376, + "loss": 0.6423, + "grad_norm": 1.547162413597107, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.510715392, + "gpu_mem": 4.49377792, + "loss": 0.6797, + "grad_norm": 3.6416873931884766, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.510715392, + "gpu_mem": 4.493971456, + "loss": 0.6036, + "grad_norm": 3.8238625526428223, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.510715392, + "gpu_mem": 4.493681152, + "loss": 0.7277, + "grad_norm": 8.09846019744873, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.510912, + "gpu_mem": 4.493648896, + "loss": 0.8994, + "grad_norm": 13.207178115844727, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.510912, + "gpu_mem": 4.493714944, + "loss": 0.5864, + "grad_norm": 1.9375393390655518, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.510912, + "gpu_mem": 4.4937088, + "loss": 0.7792, + "grad_norm": 10.860440254211426, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.511108608, + "gpu_mem": 4.493937664, + "loss": 0.8567, + "grad_norm": 15.644757270812988, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.511108608, + "gpu_mem": 4.493929984, + "loss": 0.6608, + "grad_norm": 6.629893779754639, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.511108608, + "gpu_mem": 4.493896192, + "loss": 0.8023, + "grad_norm": 10.461833953857422, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493756416, + "loss": 0.6224, + "grad_norm": 7.1698713302612305, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493681152, + "loss": 0.5662, + "grad_norm": 2.1262013912200928, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493621248, + "loss": 0.6138, + "grad_norm": 3.743492841720581, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493694976, + "loss": 0.6539, + "grad_norm": 6.320612907409668, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.511305216, + "gpu_mem": 4.4937472, + "loss": 0.7095, + "grad_norm": 5.929784774780273, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493879296, + "loss": 0.6698, + "grad_norm": 1.6754264831542969, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.511305216, + "gpu_mem": 4.49377024, + "loss": 0.6258, + "grad_norm": 1.8663870096206665, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493650432, + "loss": 0.6314, + "grad_norm": 2.4315168857574463, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493719552, + "loss": 0.635, + "grad_norm": 1.932876467704773, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.511305216, + "gpu_mem": 4.493819392, + "loss": 0.6554, + "grad_norm": 3.4201409816741943, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.511501824, + "gpu_mem": 4.493782528, + "loss": 0.7156, + "grad_norm": 7.90298318862915, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493814784, + "loss": 0.5996, + "grad_norm": 3.2398500442504883, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493765632, + "loss": 0.731, + "grad_norm": 5.571208953857422, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493773312, + "loss": 0.7147, + "grad_norm": 7.965809345245361, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493917696, + "loss": 0.5825, + "grad_norm": 2.4796321392059326, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493699584, + "loss": 0.6858, + "grad_norm": 3.1727724075317383, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493753344, + "loss": 0.7181, + "grad_norm": 8.902009010314941, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493721088, + "loss": 0.7394, + "grad_norm": 10.157062530517578, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493802496, + "loss": 0.6819, + "grad_norm": 2.2884974479675293, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493605888, + "loss": 0.7171, + "grad_norm": 3.9411673545837402, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.511698432, + "gpu_mem": 4.493719552, + "loss": 0.5627, + "grad_norm": 2.8555171489715576, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.511698432, + "gpu_mem": 4.49373952, + "loss": 0.6865, + "grad_norm": 5.10888147354126, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.511698432, + "gpu_mem": 4.49377792, + "loss": 0.631, + "grad_norm": 4.621267318725586, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.51189504, + "gpu_mem": 4.49376256, + "loss": 0.6521, + "grad_norm": 3.2591891288757324, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493675008, + "loss": 0.6631, + "grad_norm": 3.0139002799987793, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.51189504, + "gpu_mem": 4.49362432, + "loss": 0.6299, + "grad_norm": 2.386324405670166, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493741056, + "loss": 0.6204, + "grad_norm": 1.986992359161377, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493773312, + "loss": 0.7108, + "grad_norm": 6.049999237060547, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493807104, + "loss": 0.6755, + "grad_norm": 5.4609575271606445, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493857792, + "loss": 0.6327, + "grad_norm": 2.271766185760498, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.51189504, + "gpu_mem": 4.49376256, + "loss": 0.6587, + "grad_norm": 3.347034454345703, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493863936, + "loss": 0.6336, + "grad_norm": 4.408857822418213, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493814784, + "loss": 0.6127, + "grad_norm": 4.522818088531494, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493702656, + "loss": 0.5933, + "grad_norm": 1.8976800441741943, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.51189504, + "gpu_mem": 4.493886976, + "loss": 0.6641, + "grad_norm": 3.7306277751922607, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493741056, + "loss": 0.7569, + "grad_norm": 5.521296501159668, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493744128, + "loss": 0.6737, + "grad_norm": 4.360266208648682, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493713408, + "loss": 0.5683, + "grad_norm": 2.3229658603668213, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493759488, + "loss": 0.6631, + "grad_norm": 6.712233543395996, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493750272, + "loss": 0.7655, + "grad_norm": 6.895766258239746, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.512091648, + "gpu_mem": 4.49373184, + "loss": 0.7358, + "grad_norm": 8.146341323852539, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493807104, + "loss": 0.601, + "grad_norm": 3.9480652809143066, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493727232, + "loss": 0.5963, + "grad_norm": 2.6513514518737793, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493618176, + "loss": 0.7067, + "grad_norm": 5.624129772186279, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493856256, + "loss": 0.7582, + "grad_norm": 6.480310916900635, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.512091648, + "gpu_mem": 4.494026752, + "loss": 0.5894, + "grad_norm": 2.979290723800659, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493759488, + "loss": 0.5885, + "grad_norm": 1.9377977848052979, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493787136, + "loss": 0.6238, + "grad_norm": 3.6376171112060547, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493837824, + "loss": 0.5311, + "grad_norm": 4.018235206604004, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.512091648, + "gpu_mem": 4.49364736, + "loss": 0.6679, + "grad_norm": 5.683865070343018, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.512091648, + "gpu_mem": 4.494089728, + "loss": 0.6208, + "grad_norm": 5.339485168457031, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.512091648, + "gpu_mem": 4.49381632, + "loss": 0.5845, + "grad_norm": 3.5815553665161133, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.512091648, + "gpu_mem": 4.493699584, + "loss": 0.5642, + "grad_norm": 4.037660121917725, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.512288256, + "gpu_mem": 4.49413888, + "loss": 0.6483, + "grad_norm": 5.6473846435546875, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493914624, + "loss": 0.5481, + "grad_norm": 3.1490492820739746, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.512288256, + "gpu_mem": 4.49395456, + "loss": 0.6907, + "grad_norm": 3.3728561401367188, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493736448, + "loss": 0.6776, + "grad_norm": 2.8839058876037598, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493865472, + "loss": 0.6277, + "grad_norm": 2.4115381240844727, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.512288256, + "gpu_mem": 4.49394688, + "loss": 0.6367, + "grad_norm": 3.60898494720459, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493730304, + "loss": 0.5515, + "grad_norm": 2.1373813152313232, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493863936, + "loss": 0.5556, + "grad_norm": 2.2262206077575684, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493886976, + "loss": 0.6743, + "grad_norm": 6.190613746643066, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.512288256, + "gpu_mem": 4.49372416, + "loss": 0.601, + "grad_norm": 4.490257263183594, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493604352, + "loss": 0.6619, + "grad_norm": 4.613885402679443, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.512288256, + "gpu_mem": 4.4937856, + "loss": 0.5927, + "grad_norm": 2.2556755542755127, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493684224, + "loss": 0.6136, + "grad_norm": 3.3856916427612305, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493736448, + "loss": 0.6362, + "grad_norm": 5.717785835266113, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493768704, + "loss": 0.6383, + "grad_norm": 5.319495677947998, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.512288256, + "gpu_mem": 4.49390848, + "loss": 0.5894, + "grad_norm": 4.640230178833008, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493891584, + "loss": 0.6378, + "grad_norm": 3.016573905944824, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.512288256, + "gpu_mem": 4.494083584, + "loss": 0.6572, + "grad_norm": 4.5237603187561035, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493794816, + "loss": 0.5759, + "grad_norm": 3.0536630153656006, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493830144, + "loss": 0.6313, + "grad_norm": 3.5982203483581543, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.512288256, + "gpu_mem": 4.493728768, + "loss": 0.5625, + "grad_norm": 3.026538133621216, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519203328, + "loss": 0.884, + "grad_norm": 3.9491984844207764, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519138816, + "loss": 0.6115, + "grad_norm": 2.8990418910980225, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518976, + "loss": 0.5619, + "grad_norm": 3.0098154544830322, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519048192, + "loss": 0.6179, + "grad_norm": 3.9371068477630615, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51908352, + "loss": 0.4941, + "grad_norm": 2.8057267665863037, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519108096, + "loss": 0.5268, + "grad_norm": 4.285440444946289, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519069696, + "loss": 0.5617, + "grad_norm": 4.94078254699707, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51929088, + "loss": 0.5337, + "grad_norm": 3.763066291809082, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51919872, + "loss": 0.5724, + "grad_norm": 5.346607685089111, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519105024, + "loss": 0.5009, + "grad_norm": 5.106917381286621, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519031296, + "loss": 0.5795, + "grad_norm": 4.663048267364502, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519379968, + "loss": 0.4231, + "grad_norm": 4.06447696685791, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518974464, + "loss": 0.6225, + "grad_norm": 5.922128200531006, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518920704, + "loss": 0.5124, + "grad_norm": 5.51249885559082, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519696384, + "loss": 0.5305, + "grad_norm": 4.262174129486084, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519172608, + "loss": 0.5712, + "grad_norm": 6.790377140045166, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519085056, + "loss": 0.6282, + "grad_norm": 5.247696876525879, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519034368, + "loss": 0.5203, + "grad_norm": 5.485547065734863, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.512288256, + "gpu_mem": 4.5191296, + "loss": 0.5093, + "grad_norm": 4.293337821960449, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519046656, + "loss": 0.5422, + "grad_norm": 4.634438991546631, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519065088, + "loss": 0.5884, + "grad_norm": 6.198184490203857, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51915264, + "loss": 0.5658, + "grad_norm": 6.395980358123779, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51903744, + "loss": 0.4778, + "grad_norm": 4.4228129386901855, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519100416, + "loss": 0.5222, + "grad_norm": 6.1567864418029785, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519008256, + "loss": 0.6052, + "grad_norm": 6.764645099639893, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519307776, + "loss": 0.4933, + "grad_norm": 4.290213108062744, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519031296, + "loss": 0.579, + "grad_norm": 4.750498294830322, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518997504, + "loss": 0.4567, + "grad_norm": 4.482128620147705, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519135744, + "loss": 0.5528, + "grad_norm": 7.500720500946045, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519234048, + "loss": 0.4816, + "grad_norm": 5.709393501281738, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518980608, + "loss": 0.6219, + "grad_norm": 5.3650712966918945, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519080448, + "loss": 0.6314, + "grad_norm": 7.034570693969727, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.512288256, + "gpu_mem": 4.5190528, + "loss": 0.5441, + "grad_norm": 4.4938225746154785, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518989824, + "loss": 0.4821, + "grad_norm": 5.907620906829834, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519207936, + "loss": 0.7578, + "grad_norm": 9.750370025634766, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519105024, + "loss": 0.5486, + "grad_norm": 5.949014663696289, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.512288256, + "gpu_mem": 4.5190528, + "loss": 0.4688, + "grad_norm": 4.6593017578125, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519031296, + "loss": 0.5701, + "grad_norm": 4.530482769012451, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519040512, + "loss": 0.4945, + "grad_norm": 3.4325568675994873, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518972928, + "loss": 0.6189, + "grad_norm": 4.750690460205078, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519135744, + "loss": 0.5404, + "grad_norm": 4.004016399383545, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519005184, + "loss": 0.547, + "grad_norm": 4.070260524749756, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519124992, + "loss": 0.5295, + "grad_norm": 5.301217555999756, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518943744, + "loss": 0.4816, + "grad_norm": 3.699140787124634, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51907584, + "loss": 0.4443, + "grad_norm": 4.096963882446289, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519049728, + "loss": 0.5866, + "grad_norm": 4.5070319175720215, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519015936, + "loss": 0.5797, + "grad_norm": 5.173367500305176, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519120384, + "loss": 0.4506, + "grad_norm": 5.948993682861328, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519115776, + "loss": 0.5417, + "grad_norm": 4.922159671783447, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518974464, + "loss": 0.4126, + "grad_norm": 4.2993011474609375, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519166464, + "loss": 0.4516, + "grad_norm": 3.3898727893829346, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519017472, + "loss": 0.5272, + "grad_norm": 4.7902374267578125, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519120384, + "loss": 0.3785, + "grad_norm": 3.5034830570220947, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519323136, + "loss": 0.4795, + "grad_norm": 4.421779632568359, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519124992, + "loss": 0.4514, + "grad_norm": 4.375877380371094, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519011328, + "loss": 0.4037, + "grad_norm": 5.991669654846191, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519025152, + "loss": 0.4836, + "grad_norm": 5.3182172775268555, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519069696, + "loss": 0.4796, + "grad_norm": 5.581629276275635, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519011328, + "loss": 0.481, + "grad_norm": 5.053388595581055, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.512288256, + "gpu_mem": 4.5192448, + "loss": 0.4202, + "grad_norm": 5.716559886932373, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51927552, + "loss": 0.3872, + "grad_norm": 6.213109493255615, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519201792, + "loss": 0.5066, + "grad_norm": 6.249429225921631, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519089664, + "loss": 0.4701, + "grad_norm": 5.749514102935791, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519038976, + "loss": 0.42, + "grad_norm": 6.426360130310059, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519008256, + "loss": 0.4786, + "grad_norm": 5.285887241363525, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519031296, + "loss": 0.5431, + "grad_norm": 8.069002151489258, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51911424, + "loss": 0.527, + "grad_norm": 6.687179088592529, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519042048, + "loss": 0.538, + "grad_norm": 7.980434894561768, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519207936, + "loss": 0.4374, + "grad_norm": 5.38814640045166, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519049728, + "loss": 0.5355, + "grad_norm": 8.008475303649902, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519026688, + "loss": 0.4467, + "grad_norm": 5.458549499511719, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51915264, + "loss": 0.477, + "grad_norm": 6.730329990386963, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519186432, + "loss": 0.5097, + "grad_norm": 6.172441005706787, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519054336, + "loss": 0.5732, + "grad_norm": 7.062180519104004, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51919104, + "loss": 0.4604, + "grad_norm": 6.362549781799316, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519105024, + "loss": 0.4812, + "grad_norm": 6.742758274078369, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519069696, + "loss": 0.5005, + "grad_norm": 7.343570709228516, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519034368, + "loss": 0.5068, + "grad_norm": 5.180023670196533, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51918336, + "loss": 0.379, + "grad_norm": 5.091320991516113, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519072768, + "loss": 0.4426, + "grad_norm": 5.11602258682251, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519017472, + "loss": 0.5256, + "grad_norm": 5.844997882843018, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518959104, + "loss": 0.5054, + "grad_norm": 5.75474214553833, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519015936, + "loss": 0.5413, + "grad_norm": 6.338455677032471, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519293952, + "loss": 0.4582, + "grad_norm": 5.49623966217041, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519017472, + "loss": 0.5658, + "grad_norm": 5.572755813598633, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519330816, + "loss": 0.5023, + "grad_norm": 5.0739946365356445, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.512288256, + "gpu_mem": 4.5192064, + "loss": 0.4126, + "grad_norm": 5.143454074859619, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518962176, + "loss": 0.4623, + "grad_norm": 5.956307888031006, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51902208, + "loss": 0.4677, + "grad_norm": 5.250843524932861, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51908352, + "loss": 0.4254, + "grad_norm": 4.902020454406738, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519085056, + "loss": 0.4989, + "grad_norm": 5.615092754364014, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519338496, + "loss": 0.5546, + "grad_norm": 5.185640335083008, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518988288, + "loss": 0.6922, + "grad_norm": 7.394616603851318, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519284736, + "loss": 0.4441, + "grad_norm": 5.41588020324707, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519146496, + "loss": 0.5156, + "grad_norm": 6.340667247772217, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51899904, + "loss": 0.5521, + "grad_norm": 5.889646053314209, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519138816, + "loss": 0.4731, + "grad_norm": 5.065695285797119, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519017472, + "loss": 0.4947, + "grad_norm": 5.03191614151001, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519111168, + "loss": 0.4658, + "grad_norm": 4.863046169281006, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.512288256, + "gpu_mem": 4.5191296, + "loss": 0.5575, + "grad_norm": 5.03303337097168, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519081984, + "loss": 0.4857, + "grad_norm": 4.907963752746582, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518988288, + "loss": 0.4642, + "grad_norm": 5.556704521179199, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519080448, + "loss": 0.458, + "grad_norm": 4.857274055480957, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518992896, + "loss": 0.417, + "grad_norm": 5.613741874694824, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519035904, + "loss": 0.4969, + "grad_norm": 4.631195068359375, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51922944, + "loss": 0.4858, + "grad_norm": 5.4093122482299805, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519025152, + "loss": 0.5449, + "grad_norm": 5.517231464385986, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519181824, + "loss": 0.5259, + "grad_norm": 6.029206275939941, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519008256, + "loss": 0.392, + "grad_norm": 4.221251010894775, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51942144, + "loss": 0.5529, + "grad_norm": 6.13136100769043, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519080448, + "loss": 0.4382, + "grad_norm": 5.44303035736084, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.512288256, + "gpu_mem": 4.518997504, + "loss": 0.5205, + "grad_norm": 4.832195281982422, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51911424, + "loss": 0.3783, + "grad_norm": 4.001471519470215, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519069696, + "loss": 0.3939, + "grad_norm": 4.915110111236572, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519026688, + "loss": 0.381, + "grad_norm": 3.9966530799865723, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519063552, + "loss": 0.4564, + "grad_norm": 4.660472393035889, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519151104, + "loss": 0.4723, + "grad_norm": 5.000064849853516, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519069696, + "loss": 0.5149, + "grad_norm": 5.103734970092773, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519284736, + "loss": 0.4547, + "grad_norm": 5.005780220031738, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519077376, + "loss": 0.4271, + "grad_norm": 5.692404270172119, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519081984, + "loss": 0.405, + "grad_norm": 5.452561855316162, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519092736, + "loss": 0.4592, + "grad_norm": 5.44649600982666, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519131136, + "loss": 0.4592, + "grad_norm": 5.148582935333252, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51918336, + "loss": 0.414, + "grad_norm": 5.350499153137207, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519042048, + "loss": 0.4478, + "grad_norm": 4.392088413238525, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51892224, + "loss": 0.4744, + "grad_norm": 5.41953706741333, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519149568, + "loss": 0.4541, + "grad_norm": 4.928563594818115, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519393792, + "loss": 0.396, + "grad_norm": 5.382067680358887, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519054336, + "loss": 0.4373, + "grad_norm": 4.41796875, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519000576, + "loss": 0.6379, + "grad_norm": 5.958452224731445, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519163392, + "loss": 0.4834, + "grad_norm": 4.944537162780762, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519103488, + "loss": 0.4909, + "grad_norm": 4.551985740661621, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51908352, + "loss": 0.5207, + "grad_norm": 4.847994327545166, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519019008, + "loss": 0.5324, + "grad_norm": 5.380536079406738, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519447552, + "loss": 0.4495, + "grad_norm": 5.5009050369262695, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519154176, + "loss": 0.4852, + "grad_norm": 5.446907997131348, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519009792, + "loss": 0.4832, + "grad_norm": 4.65576171875, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519063552, + "loss": 0.4847, + "grad_norm": 4.889958381652832, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519481344, + "loss": 0.4748, + "grad_norm": 5.272000789642334, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519250944, + "loss": 0.4557, + "grad_norm": 4.938218116760254, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519035904, + "loss": 0.481, + "grad_norm": 5.558644771575928, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519120384, + "loss": 0.4614, + "grad_norm": 5.039109706878662, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51904512, + "loss": 0.5343, + "grad_norm": 5.333126068115234, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519080448, + "loss": 0.6398, + "grad_norm": 5.9029459953308105, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519163392, + "loss": 0.4624, + "grad_norm": 5.084424018859863, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.512288256, + "gpu_mem": 4.519080448, + "loss": 0.5694, + "grad_norm": 5.880096435546875, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51910656, + "loss": 0.5335, + "grad_norm": 5.264451503753662, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.512288256, + "gpu_mem": 4.51910656, + "train_runtime": 4458.5149, + "train_samples_per_second": 4.229, + "train_steps_per_second": 0.066, + "total_flos": 0.0, + "train_loss": 0.7076091230118355 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..91562a2718627f56cb3f88093dd26c3a98c35384 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b63baeb510c1b2e95033f40cedb4f268c847491b --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.7826130252937662 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..e9d8174fa4265c86f254e492a5d1d626c8b938f8 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1577576 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-hellaswag-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2", + "seed": 42, + "timestamp": "2025-08-30T16:43:23.732951" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..40c40950a1fae0edc631440e6aa036ea40251f63 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.488089088, + "gpu_mem": 4.424145408, + "loss": 3.4877, + "grad_norm": 241.90362548828125, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.494577152, + "gpu_mem": 4.436912128, + "loss": 3.6203, + "grad_norm": 238.2837677001953, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.4957568, + "gpu_mem": 4.436919808, + "loss": 3.154, + "grad_norm": 219.66639709472656, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.49673984, + "gpu_mem": 4.4369536, + "loss": 2.8942, + "grad_norm": 164.12588500976562, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.49772288, + "gpu_mem": 4.436916736, + "loss": 2.3059, + "grad_norm": 108.55628204345703, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.49870592, + "gpu_mem": 4.436962816, + "loss": 1.9371, + "grad_norm": 75.2010498046875, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.49968896, + "gpu_mem": 4.43692288, + "loss": 1.6792, + "grad_norm": 54.37167739868164, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.500475392, + "gpu_mem": 4.4369536, + "loss": 1.5035, + "grad_norm": 30.701902389526367, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.501261824, + "gpu_mem": 4.4369536, + "loss": 1.4498, + "grad_norm": 16.277263641357422, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.502048256, + "gpu_mem": 4.436896768, + "loss": 1.4264, + "grad_norm": 11.342458724975586, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.502834688, + "gpu_mem": 4.436916736, + "loss": 1.4206, + "grad_norm": 20.573930740356445, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.50362112, + "gpu_mem": 4.436913664, + "loss": 1.4385, + "grad_norm": 24.91997528076172, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.504210944, + "gpu_mem": 4.436905984, + "loss": 1.3991, + "grad_norm": 15.008387565612793, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.504997376, + "gpu_mem": 4.436932096, + "loss": 1.3916, + "grad_norm": 15.583274841308594, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.505783808, + "gpu_mem": 4.43693056, + "loss": 1.4263, + "grad_norm": 22.400733947753906, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.506373632, + "gpu_mem": 4.43692288, + "loss": 1.3956, + "grad_norm": 11.228878021240234, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.507160064, + "gpu_mem": 4.43692288, + "loss": 1.463, + "grad_norm": 13.905608177185059, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.507749888, + "gpu_mem": 4.43692288, + "loss": 1.3242, + "grad_norm": 8.524161338806152, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.50853632, + "gpu_mem": 4.43692288, + "loss": 1.4647, + "grad_norm": 10.819677352905273, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.509126144, + "gpu_mem": 4.436896768, + "loss": 1.5016, + "grad_norm": 16.34766387939453, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.509912576, + "gpu_mem": 4.436913664, + "loss": 1.438, + "grad_norm": 11.764151573181152, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.5105024, + "gpu_mem": 4.436921344, + "loss": 1.4885, + "grad_norm": 21.95326805114746, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.511288832, + "gpu_mem": 4.436935168, + "loss": 1.3671, + "grad_norm": 6.237077713012695, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.511878656, + "gpu_mem": 4.436919808, + "loss": 1.4204, + "grad_norm": 13.261194229125977, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.51246848, + "gpu_mem": 4.43690752, + "loss": 1.4996, + "grad_norm": 14.647529602050781, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.513058304, + "gpu_mem": 4.436913664, + "loss": 1.3791, + "grad_norm": 6.817897319793701, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.513648128, + "gpu_mem": 4.436921344, + "loss": 1.363, + "grad_norm": 3.5496745109558105, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.514041344, + "gpu_mem": 4.436916736, + "loss": 1.3724, + "grad_norm": 2.5399515628814697, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.514631168, + "gpu_mem": 4.436925952, + "loss": 1.4515, + "grad_norm": 7.351158618927002, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.5154176, + "gpu_mem": 4.436898304, + "loss": 1.4091, + "grad_norm": 5.48233699798584, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.515810816, + "gpu_mem": 4.4369536, + "loss": 1.4056, + "grad_norm": 3.8774490356445312, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.51640064, + "gpu_mem": 4.43694592, + "loss": 1.3833, + "grad_norm": 2.7514734268188477, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.516990464, + "gpu_mem": 4.43689984, + "loss": 1.4043, + "grad_norm": 5.555708885192871, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.517580288, + "gpu_mem": 4.436918272, + "loss": 1.4099, + "grad_norm": 4.449511528015137, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.518170112, + "gpu_mem": 4.436939776, + "loss": 1.4866, + "grad_norm": 12.073359489440918, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.518759936, + "gpu_mem": 4.43693824, + "loss": 1.3941, + "grad_norm": 2.994689464569092, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.51934976, + "gpu_mem": 4.436970496, + "loss": 1.3962, + "grad_norm": 4.743826389312744, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.519939584, + "gpu_mem": 4.43692288, + "loss": 1.4273, + "grad_norm": 5.094912052154541, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.520529408, + "gpu_mem": 4.436979712, + "loss": 1.3633, + "grad_norm": 6.204122543334961, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.521119232, + "gpu_mem": 4.43690752, + "loss": 1.4421, + "grad_norm": 6.1482744216918945, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.521512448, + "gpu_mem": 4.436935168, + "loss": 1.3662, + "grad_norm": 3.924143075942993, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.521905664, + "gpu_mem": 4.436948992, + "loss": 1.455, + "grad_norm": 7.653059005737305, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.522692096, + "gpu_mem": 4.436955136, + "loss": 1.379, + "grad_norm": 3.5776376724243164, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.523085312, + "gpu_mem": 4.436933632, + "loss": 1.4145, + "grad_norm": 3.8843088150024414, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.523675136, + "gpu_mem": 4.436933632, + "loss": 1.4093, + "grad_norm": 4.140640735626221, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.52426496, + "gpu_mem": 4.436933632, + "loss": 1.3665, + "grad_norm": 4.814624786376953, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.524658176, + "gpu_mem": 4.436919808, + "loss": 1.4264, + "grad_norm": 4.442328453063965, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.525248, + "gpu_mem": 4.43693824, + "loss": 1.4079, + "grad_norm": 4.087531566619873, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.525837824, + "gpu_mem": 4.436950528, + "loss": 1.4086, + "grad_norm": 3.1759774684906006, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.52623104, + "gpu_mem": 4.436927488, + "loss": 1.405, + "grad_norm": 4.485915184020996, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.526624256, + "gpu_mem": 4.436912128, + "loss": 1.3842, + "grad_norm": 3.978679656982422, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.52721408, + "gpu_mem": 4.436916736, + "loss": 1.3638, + "grad_norm": 2.873281240463257, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.527803904, + "gpu_mem": 4.436944384, + "loss": 1.4319, + "grad_norm": 9.080708503723145, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.52819712, + "gpu_mem": 4.436919808, + "loss": 1.4828, + "grad_norm": 11.819942474365234, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.528590336, + "gpu_mem": 4.43693824, + "loss": 1.4374, + "grad_norm": 8.049686431884766, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.528983552, + "gpu_mem": 4.436932096, + "loss": 1.4002, + "grad_norm": 6.084953308105469, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.529573376, + "gpu_mem": 4.436898304, + "loss": 1.4097, + "grad_norm": 9.048219680786133, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.529966592, + "gpu_mem": 4.436927488, + "loss": 1.4132, + "grad_norm": 4.993281364440918, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.530556416, + "gpu_mem": 4.436910592, + "loss": 1.3347, + "grad_norm": 3.7465627193450928, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.530949632, + "gpu_mem": 4.436952064, + "loss": 1.3856, + "grad_norm": 4.201577663421631, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.531539456, + "gpu_mem": 4.436918272, + "loss": 1.42, + "grad_norm": 3.6438090801239014, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.53212928, + "gpu_mem": 4.436958208, + "loss": 1.3192, + "grad_norm": 2.5962512493133545, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.532522496, + "gpu_mem": 4.436912128, + "loss": 1.4508, + "grad_norm": 5.008424282073975, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.532915712, + "gpu_mem": 4.436916736, + "loss": 1.4316, + "grad_norm": 4.73649263381958, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.533308928, + "gpu_mem": 4.436913664, + "loss": 1.39, + "grad_norm": 2.992147207260132, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.533702144, + "gpu_mem": 4.436932096, + "loss": 1.4239, + "grad_norm": 6.530444622039795, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.534291968, + "gpu_mem": 4.436924416, + "loss": 1.4685, + "grad_norm": 13.371931076049805, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.534881792, + "gpu_mem": 4.436909056, + "loss": 1.5021, + "grad_norm": 13.506332397460938, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.535275008, + "gpu_mem": 4.436979712, + "loss": 1.4436, + "grad_norm": 8.845291137695312, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.535864832, + "gpu_mem": 4.43693056, + "loss": 1.4091, + "grad_norm": 6.964128017425537, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.536258048, + "gpu_mem": 4.436955136, + "loss": 1.3876, + "grad_norm": 8.153307914733887, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.536651264, + "gpu_mem": 4.436925952, + "loss": 1.4071, + "grad_norm": 4.937172889709473, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.53704448, + "gpu_mem": 4.436918272, + "loss": 1.4449, + "grad_norm": 10.612336158752441, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.537437696, + "gpu_mem": 4.436912128, + "loss": 1.3922, + "grad_norm": 10.176785469055176, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.537830912, + "gpu_mem": 4.436941312, + "loss": 1.4207, + "grad_norm": 6.5194807052612305, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.538224128, + "gpu_mem": 4.436932096, + "loss": 1.3701, + "grad_norm": 5.880921840667725, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.538617344, + "gpu_mem": 4.436919808, + "loss": 1.4006, + "grad_norm": 4.762421607971191, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.53901056, + "gpu_mem": 4.436912128, + "loss": 1.4759, + "grad_norm": 13.992104530334473, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.539403776, + "gpu_mem": 4.436964352, + "loss": 1.4071, + "grad_norm": 7.344609260559082, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.5399936, + "gpu_mem": 4.436942848, + "loss": 1.4055, + "grad_norm": 5.657229423522949, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.540386816, + "gpu_mem": 4.436936704, + "loss": 1.3882, + "grad_norm": 9.168224334716797, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.540780032, + "gpu_mem": 4.436913664, + "loss": 1.3795, + "grad_norm": 9.726070404052734, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.541173248, + "gpu_mem": 4.436935168, + "loss": 1.9648, + "grad_norm": 168.8311309814453, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.541566464, + "gpu_mem": 4.43690752, + "loss": 1.5657, + "grad_norm": 34.121585845947266, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.54195968, + "gpu_mem": 4.4369152, + "loss": 1.398, + "grad_norm": 6.760315418243408, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.542352896, + "gpu_mem": 4.436933632, + "loss": 1.3703, + "grad_norm": 1.545555591583252, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.542746112, + "gpu_mem": 4.43692288, + "loss": 1.4379, + "grad_norm": 5.410030841827393, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.543139328, + "gpu_mem": 4.436921344, + "loss": 1.4222, + "grad_norm": 5.5628767013549805, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.543532544, + "gpu_mem": 4.436916736, + "loss": 1.4162, + "grad_norm": 4.204005241394043, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.54392576, + "gpu_mem": 4.436921344, + "loss": 1.3941, + "grad_norm": 4.120467185974121, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.544318976, + "gpu_mem": 4.436932096, + "loss": 1.4998, + "grad_norm": 13.538830757141113, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.544712192, + "gpu_mem": 4.436935168, + "loss": 1.4359, + "grad_norm": 7.964791774749756, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.545105408, + "gpu_mem": 4.436935168, + "loss": 1.425, + "grad_norm": 5.441211700439453, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.545498624, + "gpu_mem": 4.43693056, + "loss": 1.3939, + "grad_norm": 4.65664005279541, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.545695232, + "gpu_mem": 4.436948992, + "loss": 1.4277, + "grad_norm": 6.111709117889404, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.546088448, + "gpu_mem": 4.436952064, + "loss": 1.3981, + "grad_norm": 3.79237961769104, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.546481664, + "gpu_mem": 4.436929024, + "loss": 1.4103, + "grad_norm": 4.288849353790283, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.54687488, + "gpu_mem": 4.436939776, + "loss": 1.4001, + "grad_norm": 3.2684688568115234, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.547268096, + "gpu_mem": 4.436939776, + "loss": 1.4543, + "grad_norm": 10.608121871948242, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.547661312, + "gpu_mem": 4.4369152, + "loss": 1.4163, + "grad_norm": 5.638442039489746, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.548054528, + "gpu_mem": 4.436944384, + "loss": 1.3884, + "grad_norm": 3.4781880378723145, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.548447744, + "gpu_mem": 4.436921344, + "loss": 1.365, + "grad_norm": 2.517791748046875, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.54884096, + "gpu_mem": 4.43693824, + "loss": 1.431, + "grad_norm": 3.4702959060668945, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.549037568, + "gpu_mem": 4.436905984, + "loss": 1.3888, + "grad_norm": 3.8904337882995605, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.549430784, + "gpu_mem": 4.436921344, + "loss": 1.3802, + "grad_norm": 3.688410758972168, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.549824, + "gpu_mem": 4.436901376, + "loss": 1.5614, + "grad_norm": 10.637764930725098, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.550020608, + "gpu_mem": 4.436942848, + "loss": 1.4072, + "grad_norm": 3.104708194732666, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.550413824, + "gpu_mem": 4.43693824, + "loss": 1.3898, + "grad_norm": 1.8257195949554443, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.55080704, + "gpu_mem": 4.436944384, + "loss": 1.4578, + "grad_norm": 8.511282920837402, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.551003648, + "gpu_mem": 4.436941312, + "loss": 1.444, + "grad_norm": 7.872385025024414, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.551396864, + "gpu_mem": 4.436942848, + "loss": 1.3714, + "grad_norm": 4.319558143615723, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.55179008, + "gpu_mem": 4.436939776, + "loss": 1.4074, + "grad_norm": 4.419754981994629, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.552183296, + "gpu_mem": 4.436919808, + "loss": 1.4371, + "grad_norm": 4.935364246368408, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.552576512, + "gpu_mem": 4.4369152, + "loss": 1.4026, + "grad_norm": 5.536993980407715, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.55277312, + "gpu_mem": 4.436933632, + "loss": 1.4314, + "grad_norm": 5.418115615844727, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.553166336, + "gpu_mem": 4.436944384, + "loss": 1.3874, + "grad_norm": 1.4687130451202393, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.553362944, + "gpu_mem": 4.43693056, + "loss": 1.4204, + "grad_norm": 4.573318958282471, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.55375616, + "gpu_mem": 4.43694592, + "loss": 1.3481, + "grad_norm": 3.1209068298339844, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.554149376, + "gpu_mem": 4.436927488, + "loss": 1.498, + "grad_norm": 11.857294082641602, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.554542592, + "gpu_mem": 4.4369536, + "loss": 1.4725, + "grad_norm": 8.6198148727417, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.5547392, + "gpu_mem": 4.436912128, + "loss": 1.3752, + "grad_norm": 3.041146993637085, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.555132416, + "gpu_mem": 4.436944384, + "loss": 1.409, + "grad_norm": 4.232198715209961, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.555525632, + "gpu_mem": 4.43693824, + "loss": 1.4259, + "grad_norm": 4.541390419006348, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.555918848, + "gpu_mem": 4.436939776, + "loss": 1.3878, + "grad_norm": 3.5401904582977295, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.556312064, + "gpu_mem": 4.4369152, + "loss": 1.468, + "grad_norm": 10.657790184020996, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.556508672, + "gpu_mem": 4.436924416, + "loss": 1.442, + "grad_norm": 9.351613998413086, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.55670528, + "gpu_mem": 4.436910592, + "loss": 1.479, + "grad_norm": 8.64948844909668, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.557098496, + "gpu_mem": 4.436947456, + "loss": 1.3752, + "grad_norm": 2.392360210418701, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.557295104, + "gpu_mem": 4.436944384, + "loss": 1.4088, + "grad_norm": 2.834447145462036, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.55768832, + "gpu_mem": 4.436944384, + "loss": 1.3991, + "grad_norm": 2.657150983810425, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.558081536, + "gpu_mem": 4.436933632, + "loss": 1.3935, + "grad_norm": 2.711601734161377, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.558278144, + "gpu_mem": 4.436933632, + "loss": 1.3956, + "grad_norm": 4.471643924713135, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.55867136, + "gpu_mem": 4.4369152, + "loss": 1.494, + "grad_norm": 10.330343246459961, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.559064576, + "gpu_mem": 4.436925952, + "loss": 1.4286, + "grad_norm": 6.339375972747803, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.559457792, + "gpu_mem": 4.436935168, + "loss": 1.4106, + "grad_norm": 15.671834945678711, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.5596544, + "gpu_mem": 4.436950528, + "loss": 1.4009, + "grad_norm": 3.698800563812256, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.560047616, + "gpu_mem": 4.436898304, + "loss": 1.3998, + "grad_norm": 4.2266411781311035, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.560244224, + "gpu_mem": 4.436918272, + "loss": 1.4291, + "grad_norm": 5.915955543518066, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.560440832, + "gpu_mem": 4.43689984, + "loss": 1.4058, + "grad_norm": 3.393568515777588, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.560834048, + "gpu_mem": 4.436916736, + "loss": 1.3809, + "grad_norm": 2.658200979232788, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.561030656, + "gpu_mem": 4.43692288, + "loss": 1.4265, + "grad_norm": 5.219371795654297, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.561227264, + "gpu_mem": 4.436919808, + "loss": 1.3974, + "grad_norm": 9.667746543884277, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.56162048, + "gpu_mem": 4.43694592, + "loss": 1.394, + "grad_norm": 4.256577968597412, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.561817088, + "gpu_mem": 4.436919808, + "loss": 1.3894, + "grad_norm": 5.617374897003174, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.562210304, + "gpu_mem": 4.436959744, + "loss": 1.3652, + "grad_norm": 3.6724853515625, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.562406912, + "gpu_mem": 4.436909056, + "loss": 1.4076, + "grad_norm": 2.855616807937622, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.562800128, + "gpu_mem": 4.436918272, + "loss": 1.446, + "grad_norm": 4.65022087097168, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.562996736, + "gpu_mem": 4.43693824, + "loss": 1.3781, + "grad_norm": 2.2443299293518066, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.563389952, + "gpu_mem": 4.436929024, + "loss": 1.3872, + "grad_norm": 5.994711875915527, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.56358656, + "gpu_mem": 4.436941312, + "loss": 1.4, + "grad_norm": 4.952937126159668, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.563783168, + "gpu_mem": 4.436905984, + "loss": 1.4595, + "grad_norm": 7.068958282470703, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.563979776, + "gpu_mem": 4.436936704, + "loss": 1.4208, + "grad_norm": 5.334195137023926, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.564372992, + "gpu_mem": 4.436932096, + "loss": 1.378, + "grad_norm": 4.242816925048828, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.5645696, + "gpu_mem": 4.436956672, + "loss": 1.3949, + "grad_norm": 3.568427085876465, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.564962816, + "gpu_mem": 4.436893696, + "loss": 1.4458, + "grad_norm": 5.308867931365967, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.565159424, + "gpu_mem": 4.436947456, + "loss": 1.3806, + "grad_norm": 2.49784779548645, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.565356032, + "gpu_mem": 4.436939776, + "loss": 1.3983, + "grad_norm": 3.389962911605835, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.56555264, + "gpu_mem": 4.436955136, + "loss": 1.3925, + "grad_norm": 2.092052936553955, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.565945856, + "gpu_mem": 4.436956672, + "loss": 1.3887, + "grad_norm": 1.7223609685897827, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.566142464, + "gpu_mem": 4.436924416, + "loss": 1.409, + "grad_norm": 4.628638744354248, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.566339072, + "gpu_mem": 4.436916736, + "loss": 1.3718, + "grad_norm": 1.936938762664795, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.56653568, + "gpu_mem": 4.436952064, + "loss": 1.3986, + "grad_norm": 4.912286281585693, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.566928896, + "gpu_mem": 4.43689984, + "loss": 1.4685, + "grad_norm": 10.85272216796875, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.567125504, + "gpu_mem": 4.436935168, + "loss": 1.5338, + "grad_norm": 13.72280502319336, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.56751872, + "gpu_mem": 4.436924416, + "loss": 1.4071, + "grad_norm": 4.751026630401611, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.567911936, + "gpu_mem": 4.436956672, + "loss": 1.3743, + "grad_norm": 2.554128885269165, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.568108544, + "gpu_mem": 4.436950528, + "loss": 1.3745, + "grad_norm": 3.347041606903076, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.568305152, + "gpu_mem": 4.436929024, + "loss": 1.3967, + "grad_norm": 2.4096672534942627, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.56850176, + "gpu_mem": 4.436925952, + "loss": 1.3739, + "grad_norm": 2.6572647094726562, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.568894976, + "gpu_mem": 4.436962816, + "loss": 1.3964, + "grad_norm": 2.868647575378418, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.569091584, + "gpu_mem": 4.436936704, + "loss": 1.3602, + "grad_norm": 1.770275592803955, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.569288192, + "gpu_mem": 4.436913664, + "loss": 1.3838, + "grad_norm": 3.1046483516693115, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.569681408, + "gpu_mem": 4.436939776, + "loss": 1.3526, + "grad_norm": 3.443833351135254, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.569878016, + "gpu_mem": 4.436952064, + "loss": 1.3095, + "grad_norm": 4.371649265289307, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.570074624, + "gpu_mem": 4.436913664, + "loss": 1.3702, + "grad_norm": 9.696763038635254, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.570271232, + "gpu_mem": 4.43692288, + "loss": 1.4842, + "grad_norm": 14.07298469543457, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.57046784, + "gpu_mem": 4.436904448, + "loss": 1.4024, + "grad_norm": 12.896562576293945, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.570664448, + "gpu_mem": 4.436918272, + "loss": 1.3245, + "grad_norm": 10.335596084594727, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.570861056, + "gpu_mem": 4.436925952, + "loss": 1.286, + "grad_norm": 9.373173713684082, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.571057664, + "gpu_mem": 4.43690752, + "loss": 1.3347, + "grad_norm": 10.524504661560059, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.57145088, + "gpu_mem": 4.43693824, + "loss": 1.3477, + "grad_norm": 10.140438079833984, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.571647488, + "gpu_mem": 4.436909056, + "loss": 1.3172, + "grad_norm": 9.12689208984375, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.571844096, + "gpu_mem": 4.436933632, + "loss": 1.2545, + "grad_norm": 9.095449447631836, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.572040704, + "gpu_mem": 4.436913664, + "loss": 1.3505, + "grad_norm": 14.985014915466309, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.57243392, + "gpu_mem": 4.43694592, + "loss": 1.3748, + "grad_norm": 16.65363311767578, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.572630528, + "gpu_mem": 4.436936704, + "loss": 1.4149, + "grad_norm": 14.779111862182617, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.572827136, + "gpu_mem": 4.436932096, + "loss": 1.271, + "grad_norm": 10.550195693969727, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.573023744, + "gpu_mem": 4.436889088, + "loss": 1.4106, + "grad_norm": 10.818365097045898, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.573220352, + "gpu_mem": 4.43696896, + "loss": 1.3686, + "grad_norm": 10.522789001464844, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.57341696, + "gpu_mem": 4.436919808, + "loss": 1.2848, + "grad_norm": 9.571028709411621, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.573613568, + "gpu_mem": 4.436919808, + "loss": 1.3062, + "grad_norm": 7.7640581130981445, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.573810176, + "gpu_mem": 4.436886016, + "loss": 1.3775, + "grad_norm": 10.665118217468262, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.574006784, + "gpu_mem": 4.436925952, + "loss": 1.2949, + "grad_norm": 7.862358570098877, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.5744, + "gpu_mem": 4.436921344, + "loss": 1.2931, + "grad_norm": 10.97903060913086, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.574596608, + "gpu_mem": 4.436909056, + "loss": 1.2539, + "grad_norm": 6.608334064483643, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.574793216, + "gpu_mem": 4.436933632, + "loss": 1.3335, + "grad_norm": 9.479626655578613, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.574989824, + "gpu_mem": 4.436948992, + "loss": 1.2823, + "grad_norm": 9.61367130279541, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.575186432, + "gpu_mem": 4.436921344, + "loss": 1.2645, + "grad_norm": 8.816216468811035, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.57538304, + "gpu_mem": 4.436918272, + "loss": 1.259, + "grad_norm": 14.01868724822998, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.575579648, + "gpu_mem": 4.436933632, + "loss": 1.4218, + "grad_norm": 20.759809494018555, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.575579648, + "gpu_mem": 4.436910592, + "loss": 1.1618, + "grad_norm": 11.007397651672363, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.575776256, + "gpu_mem": 4.436912128, + "loss": 1.1557, + "grad_norm": 11.840123176574707, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.575972864, + "gpu_mem": 4.4369536, + "loss": 1.1334, + "grad_norm": 13.242268562316895, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.576169472, + "gpu_mem": 4.436924416, + "loss": 1.1384, + "grad_norm": 13.936006546020508, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.576562688, + "gpu_mem": 4.436924416, + "loss": 1.2811, + "grad_norm": 15.677022933959961, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.576759296, + "gpu_mem": 4.436921344, + "loss": 1.2402, + "grad_norm": 17.103199005126953, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.576955904, + "gpu_mem": 4.436921344, + "loss": 1.2814, + "grad_norm": 16.412954330444336, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.577152512, + "gpu_mem": 4.436912128, + "loss": 1.1382, + "grad_norm": 15.65660285949707, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.57734912, + "gpu_mem": 4.436947456, + "loss": 1.1738, + "grad_norm": 20.234386444091797, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.577545728, + "gpu_mem": 4.436904448, + "loss": 1.2437, + "grad_norm": 12.806293487548828, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.577742336, + "gpu_mem": 4.436932096, + "loss": 1.2049, + "grad_norm": 13.271135330200195, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.577938944, + "gpu_mem": 4.436941312, + "loss": 1.0933, + "grad_norm": 13.20634937286377, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.578135552, + "gpu_mem": 4.436913664, + "loss": 1.107, + "grad_norm": 14.315176963806152, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.57833216, + "gpu_mem": 4.43692288, + "loss": 1.2005, + "grad_norm": 13.631906509399414, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.578528768, + "gpu_mem": 4.436924416, + "loss": 1.3179, + "grad_norm": 18.204267501831055, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.578725376, + "gpu_mem": 4.436924416, + "loss": 1.1494, + "grad_norm": 17.389556884765625, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.578921984, + "gpu_mem": 4.436909056, + "loss": 1.139, + "grad_norm": 26.20899200439453, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.579118592, + "gpu_mem": 4.43693056, + "loss": 1.1821, + "grad_norm": 21.787784576416016, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.5793152, + "gpu_mem": 4.436964352, + "loss": 1.2842, + "grad_norm": 17.286773681640625, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.579511808, + "gpu_mem": 4.436918272, + "loss": 1.2443, + "grad_norm": 25.711593627929688, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.579708416, + "gpu_mem": 4.436924416, + "loss": 1.3196, + "grad_norm": 13.14501667022705, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.579905024, + "gpu_mem": 4.436939776, + "loss": 1.1655, + "grad_norm": 15.12158489227295, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.579905024, + "gpu_mem": 4.436958208, + "loss": 1.1373, + "grad_norm": 13.445638656616211, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.580101632, + "gpu_mem": 4.436927488, + "loss": 1.0459, + "grad_norm": 22.770763397216797, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.58029824, + "gpu_mem": 4.436913664, + "loss": 1.1154, + "grad_norm": 16.206815719604492, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.580494848, + "gpu_mem": 4.436905984, + "loss": 1.0589, + "grad_norm": 14.71753215789795, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.580691456, + "gpu_mem": 4.436970496, + "loss": 1.0837, + "grad_norm": 18.158159255981445, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.580888064, + "gpu_mem": 4.436909056, + "loss": 1.1044, + "grad_norm": 21.470510482788086, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.581084672, + "gpu_mem": 4.43696128, + "loss": 0.952, + "grad_norm": 20.478343963623047, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.58128128, + "gpu_mem": 4.436942848, + "loss": 0.9568, + "grad_norm": 18.30299186706543, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.581477888, + "gpu_mem": 4.436941312, + "loss": 0.9652, + "grad_norm": 16.326221466064453, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.581674496, + "gpu_mem": 4.43694592, + "loss": 1.0658, + "grad_norm": 22.273956298828125, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.581871104, + "gpu_mem": 4.436921344, + "loss": 0.8366, + "grad_norm": 22.221769332885742, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.582067712, + "gpu_mem": 4.436950528, + "loss": 1.1762, + "grad_norm": 39.18935775756836, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.58226432, + "gpu_mem": 4.436927488, + "loss": 1.0584, + "grad_norm": 21.668363571166992, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.582460928, + "gpu_mem": 4.436988928, + "loss": 1.0014, + "grad_norm": 22.00063705444336, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.582460928, + "gpu_mem": 4.436913664, + "loss": 0.9412, + "grad_norm": 15.908936500549316, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.582657536, + "gpu_mem": 4.436924416, + "loss": 1.0522, + "grad_norm": 29.43532371520996, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.582854144, + "gpu_mem": 4.43692288, + "loss": 1.2668, + "grad_norm": 27.527606964111328, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.583050752, + "gpu_mem": 4.436919808, + "loss": 1.1808, + "grad_norm": 22.69518280029297, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.58324736, + "gpu_mem": 4.436950528, + "loss": 1.0042, + "grad_norm": 21.08757209777832, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.583443968, + "gpu_mem": 4.436929024, + "loss": 1.0955, + "grad_norm": 23.198461532592773, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.583640576, + "gpu_mem": 4.436924416, + "loss": 1.0187, + "grad_norm": 21.032400131225586, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.583837184, + "gpu_mem": 4.436935168, + "loss": 1.0306, + "grad_norm": 15.72517204284668, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.584033792, + "gpu_mem": 4.436939776, + "loss": 0.9821, + "grad_norm": 10.96260929107666, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.5842304, + "gpu_mem": 4.436901376, + "loss": 1.0171, + "grad_norm": 12.094376564025879, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.5842304, + "gpu_mem": 4.43696896, + "loss": 1.0279, + "grad_norm": 12.474210739135742, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.5842304, + "gpu_mem": 4.436932096, + "loss": 0.9226, + "grad_norm": 13.618504524230957, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.584427008, + "gpu_mem": 4.436921344, + "loss": 1.021, + "grad_norm": 19.20660400390625, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.584623616, + "gpu_mem": 4.43693824, + "loss": 0.9365, + "grad_norm": 13.512292861938477, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.584820224, + "gpu_mem": 4.436912128, + "loss": 0.9663, + "grad_norm": 18.542512893676758, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.584820224, + "gpu_mem": 4.436959744, + "loss": 0.9839, + "grad_norm": 16.40854263305664, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.585016832, + "gpu_mem": 4.436927488, + "loss": 1.025, + "grad_norm": 26.17331314086914, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.58521344, + "gpu_mem": 4.436916736, + "loss": 0.9149, + "grad_norm": 27.041730880737305, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.585410048, + "gpu_mem": 4.436932096, + "loss": 0.8307, + "grad_norm": 20.591215133666992, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.585410048, + "gpu_mem": 4.436929024, + "loss": 0.7595, + "grad_norm": 19.721179962158203, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.585606656, + "gpu_mem": 4.436929024, + "loss": 1.0104, + "grad_norm": 21.43137550354004, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.585803264, + "gpu_mem": 4.436916736, + "loss": 0.9018, + "grad_norm": 23.34449577331543, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.585999872, + "gpu_mem": 4.43689984, + "loss": 1.0665, + "grad_norm": 17.62995719909668, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.58619648, + "gpu_mem": 4.436962816, + "loss": 0.9813, + "grad_norm": 18.687274932861328, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.58619648, + "gpu_mem": 4.436916736, + "loss": 0.895, + "grad_norm": 15.335165977478027, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.586393088, + "gpu_mem": 4.436925952, + "loss": 0.9055, + "grad_norm": 24.01694107055664, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.586589696, + "gpu_mem": 4.43696128, + "loss": 0.8227, + "grad_norm": 16.897476196289062, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.586786304, + "gpu_mem": 4.436925952, + "loss": 0.8837, + "grad_norm": 14.621524810791016, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.586982912, + "gpu_mem": 4.43693056, + "loss": 0.9329, + "grad_norm": 16.350372314453125, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.58717952, + "gpu_mem": 4.436978176, + "loss": 0.6949, + "grad_norm": 19.85226821899414, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.58717952, + "gpu_mem": 4.436987392, + "loss": 0.7231, + "grad_norm": 18.09774398803711, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.587376128, + "gpu_mem": 4.436941312, + "loss": 0.6375, + "grad_norm": 17.430437088012695, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.587572736, + "gpu_mem": 4.436935168, + "loss": 0.8899, + "grad_norm": 25.402971267700195, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.587572736, + "gpu_mem": 4.436996608, + "loss": 0.8733, + "grad_norm": 24.573543548583984, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.587769344, + "gpu_mem": 4.43692288, + "loss": 0.8418, + "grad_norm": 28.2899112701416, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.587965952, + "gpu_mem": 4.436921344, + "loss": 0.8342, + "grad_norm": 27.17462158203125, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.587965952, + "gpu_mem": 4.436924416, + "loss": 0.8337, + "grad_norm": 22.65558433532715, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.58816256, + "gpu_mem": 4.436910592, + "loss": 0.8481, + "grad_norm": 23.472333908081055, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.58816256, + "gpu_mem": 4.436925952, + "loss": 0.7309, + "grad_norm": 20.086904525756836, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.588359168, + "gpu_mem": 4.436964352, + "loss": 0.6952, + "grad_norm": 18.061967849731445, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.588555776, + "gpu_mem": 4.436944384, + "loss": 0.968, + "grad_norm": 20.638925552368164, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.588555776, + "gpu_mem": 4.436970496, + "loss": 0.8875, + "grad_norm": 17.965011596679688, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.588948992, + "gpu_mem": 4.436921344, + "loss": 0.6326, + "grad_norm": 15.595129013061523, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.588948992, + "gpu_mem": 4.4369152, + "loss": 0.7553, + "grad_norm": 17.26482391357422, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.588948992, + "gpu_mem": 4.43693824, + "loss": 0.7499, + "grad_norm": 19.800975799560547, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.5891456, + "gpu_mem": 4.436916736, + "loss": 0.8598, + "grad_norm": 22.96339988708496, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.589342208, + "gpu_mem": 4.43693056, + "loss": 1.0323, + "grad_norm": 23.777544021606445, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.589538816, + "gpu_mem": 4.436935168, + "loss": 0.5932, + "grad_norm": 21.020109176635742, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.589538816, + "gpu_mem": 4.4369536, + "loss": 0.8503, + "grad_norm": 25.27409553527832, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.589735424, + "gpu_mem": 4.436924416, + "loss": 0.6402, + "grad_norm": 15.202515602111816, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.589932032, + "gpu_mem": 4.436952064, + "loss": 1.0287, + "grad_norm": 24.3122615814209, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.589932032, + "gpu_mem": 4.436933632, + "loss": 0.7395, + "grad_norm": 15.268125534057617, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.589932032, + "gpu_mem": 4.436921344, + "loss": 0.8532, + "grad_norm": 16.243370056152344, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.59012864, + "gpu_mem": 4.43693056, + "loss": 0.9398, + "grad_norm": 18.406536102294922, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.590325248, + "gpu_mem": 4.436927488, + "loss": 0.8969, + "grad_norm": 16.45110321044922, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.590325248, + "gpu_mem": 4.436942848, + "loss": 0.6435, + "grad_norm": 14.347558975219727, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.590521856, + "gpu_mem": 4.436950528, + "loss": 0.7766, + "grad_norm": 19.81787109375, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.590521856, + "gpu_mem": 4.436939776, + "loss": 0.909, + "grad_norm": 19.08997917175293, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.590718464, + "gpu_mem": 4.436924416, + "loss": 0.9583, + "grad_norm": 22.47648048400879, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.590915072, + "gpu_mem": 4.436927488, + "loss": 0.9397, + "grad_norm": 21.780574798583984, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.59111168, + "gpu_mem": 4.436921344, + "loss": 0.7249, + "grad_norm": 21.357091903686523, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.59111168, + "gpu_mem": 4.436916736, + "loss": 0.9038, + "grad_norm": 21.246315002441406, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.591308288, + "gpu_mem": 4.43693824, + "loss": 0.8239, + "grad_norm": 19.782630920410156, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.591308288, + "gpu_mem": 4.43693056, + "loss": 0.7733, + "grad_norm": 13.903717041015625, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.591308288, + "gpu_mem": 4.436902912, + "loss": 0.9299, + "grad_norm": 18.777523040771484, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.591504896, + "gpu_mem": 4.436901376, + "loss": 0.697, + "grad_norm": 15.403160095214844, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.591504896, + "gpu_mem": 4.436927488, + "loss": 0.823, + "grad_norm": 18.980356216430664, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.591701504, + "gpu_mem": 4.436910592, + "loss": 0.6307, + "grad_norm": 15.2971830368042, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.591898112, + "gpu_mem": 4.436941312, + "loss": 0.7459, + "grad_norm": 23.428600311279297, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.591898112, + "gpu_mem": 4.436924416, + "loss": 0.8731, + "grad_norm": 19.309341430664062, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.59209472, + "gpu_mem": 4.436955136, + "loss": 0.9037, + "grad_norm": 22.455564498901367, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.592291328, + "gpu_mem": 4.43692288, + "loss": 0.997, + "grad_norm": 23.522445678710938, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.592291328, + "gpu_mem": 4.436948992, + "loss": 0.8152, + "grad_norm": 20.53652572631836, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.592291328, + "gpu_mem": 4.436924416, + "loss": 0.707, + "grad_norm": 15.77580451965332, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.592487936, + "gpu_mem": 4.436919808, + "loss": 0.8242, + "grad_norm": 17.85352325439453, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.592487936, + "gpu_mem": 4.43692288, + "loss": 0.8485, + "grad_norm": 18.147342681884766, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.592487936, + "gpu_mem": 4.436941312, + "loss": 0.651, + "grad_norm": 14.757570266723633, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.592684544, + "gpu_mem": 4.436921344, + "loss": 0.8363, + "grad_norm": 14.060874938964844, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.592684544, + "gpu_mem": 4.436925952, + "loss": 0.7903, + "grad_norm": 14.21678638458252, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.592881152, + "gpu_mem": 4.436921344, + "loss": 0.7814, + "grad_norm": 20.17334747314453, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.592881152, + "gpu_mem": 4.436929024, + "loss": 0.5541, + "grad_norm": 13.71360969543457, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.59307776, + "gpu_mem": 4.4369536, + "loss": 0.7349, + "grad_norm": 17.3328800201416, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.59307776, + "gpu_mem": 4.43694592, + "loss": 0.6927, + "grad_norm": 18.419845581054688, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.593274368, + "gpu_mem": 4.436947456, + "loss": 0.7777, + "grad_norm": 18.338157653808594, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.593470976, + "gpu_mem": 4.43692288, + "loss": 0.6166, + "grad_norm": 19.599151611328125, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.593667584, + "gpu_mem": 4.436924416, + "loss": 0.81, + "grad_norm": 23.615489959716797, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.593667584, + "gpu_mem": 4.436944384, + "loss": 0.7968, + "grad_norm": 21.297399520874023, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.593667584, + "gpu_mem": 4.436916736, + "loss": 1.003, + "grad_norm": 25.882535934448242, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.593864192, + "gpu_mem": 4.436929024, + "loss": 0.5797, + "grad_norm": 19.94464683532715, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.5940608, + "gpu_mem": 4.43693824, + "loss": 0.8381, + "grad_norm": 25.126588821411133, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.5940608, + "gpu_mem": 4.4369152, + "loss": 0.6147, + "grad_norm": 18.76034164428711, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.5940608, + "gpu_mem": 4.436939776, + "loss": 0.9793, + "grad_norm": 24.708589553833008, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.594257408, + "gpu_mem": 4.436939776, + "loss": 0.7329, + "grad_norm": 19.350584030151367, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.594257408, + "gpu_mem": 4.43692288, + "loss": 0.8151, + "grad_norm": 20.003129959106445, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.594257408, + "gpu_mem": 4.436919808, + "loss": 0.7065, + "grad_norm": 16.23320770263672, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.594454016, + "gpu_mem": 4.436912128, + "loss": 0.7599, + "grad_norm": 16.238933563232422, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.594454016, + "gpu_mem": 4.436950528, + "loss": 0.8025, + "grad_norm": 16.76068687438965, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.594454016, + "gpu_mem": 4.436927488, + "loss": 0.9212, + "grad_norm": 19.284290313720703, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.594650624, + "gpu_mem": 4.436925952, + "loss": 0.7315, + "grad_norm": 19.758514404296875, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.594650624, + "gpu_mem": 4.436942848, + "loss": 0.7077, + "grad_norm": 17.041902542114258, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.594650624, + "gpu_mem": 4.436927488, + "loss": 0.6366, + "grad_norm": 16.076799392700195, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.594847232, + "gpu_mem": 4.436939776, + "loss": 0.674, + "grad_norm": 16.31786346435547, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.594847232, + "gpu_mem": 4.436952064, + "loss": 0.6147, + "grad_norm": 16.828685760498047, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.59504384, + "gpu_mem": 4.436927488, + "loss": 0.8487, + "grad_norm": 24.979177474975586, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.59504384, + "gpu_mem": 4.436972032, + "loss": 1.0216, + "grad_norm": 23.142898559570312, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.59504384, + "gpu_mem": 4.43694592, + "loss": 0.8074, + "grad_norm": 21.19594955444336, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.595240448, + "gpu_mem": 4.436942848, + "loss": 0.5722, + "grad_norm": 15.304789543151855, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.595437056, + "gpu_mem": 4.436924416, + "loss": 0.5401, + "grad_norm": 14.968832015991211, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.595437056, + "gpu_mem": 4.43693056, + "loss": 0.5843, + "grad_norm": 16.399700164794922, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.595437056, + "gpu_mem": 4.43689984, + "loss": 0.7979, + "grad_norm": 21.134702682495117, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.595633664, + "gpu_mem": 4.436964352, + "loss": 0.7384, + "grad_norm": 19.78372573852539, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.595633664, + "gpu_mem": 4.436918272, + "loss": 0.6944, + "grad_norm": 19.546159744262695, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.595830272, + "gpu_mem": 4.436912128, + "loss": 0.753, + "grad_norm": 19.75483512878418, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.595830272, + "gpu_mem": 4.436967424, + "loss": 0.5537, + "grad_norm": 16.68903923034668, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.595830272, + "gpu_mem": 4.436933632, + "loss": 0.6418, + "grad_norm": 22.884973526000977, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.59602688, + "gpu_mem": 4.436921344, + "loss": 0.6371, + "grad_norm": 20.107852935791016, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.59602688, + "gpu_mem": 4.436925952, + "loss": 0.5546, + "grad_norm": 16.256052017211914, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.59602688, + "gpu_mem": 4.436905984, + "loss": 0.6996, + "grad_norm": 23.405519485473633, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.596223488, + "gpu_mem": 4.43693056, + "loss": 0.6388, + "grad_norm": 18.495140075683594, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.596223488, + "gpu_mem": 4.436909056, + "loss": 0.5382, + "grad_norm": 18.691219329833984, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.596223488, + "gpu_mem": 4.436925952, + "loss": 0.5742, + "grad_norm": 20.012691497802734, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.596420096, + "gpu_mem": 4.436890624, + "loss": 0.8283, + "grad_norm": 19.03293800354004, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.596616704, + "gpu_mem": 4.43692288, + "loss": 0.6105, + "grad_norm": 15.918546676635742, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.596616704, + "gpu_mem": 4.436912128, + "loss": 0.7001, + "grad_norm": 19.3937931060791, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.596813312, + "gpu_mem": 4.436948992, + "loss": 0.7276, + "grad_norm": 16.817014694213867, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.596813312, + "gpu_mem": 4.4369152, + "loss": 0.8731, + "grad_norm": 18.69092559814453, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.596813312, + "gpu_mem": 4.43693824, + "loss": 0.8148, + "grad_norm": 18.156118392944336, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.596813312, + "gpu_mem": 4.436927488, + "loss": 0.6588, + "grad_norm": 14.902803421020508, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.59700992, + "gpu_mem": 4.436933632, + "loss": 0.6317, + "grad_norm": 16.067657470703125, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.59700992, + "gpu_mem": 4.436927488, + "loss": 0.7853, + "grad_norm": 17.04909324645996, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.59700992, + "gpu_mem": 4.43694592, + "loss": 0.8101, + "grad_norm": 17.53873634338379, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.59700992, + "gpu_mem": 4.436905984, + "loss": 0.6802, + "grad_norm": 17.59600257873535, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.59700992, + "gpu_mem": 4.43693824, + "loss": 0.8878, + "grad_norm": 17.993879318237305, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.597206528, + "gpu_mem": 4.436958208, + "loss": 0.8237, + "grad_norm": 15.670769691467285, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.597403136, + "gpu_mem": 4.436952064, + "loss": 0.6331, + "grad_norm": 14.512020111083984, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.597403136, + "gpu_mem": 4.4369152, + "loss": 0.7014, + "grad_norm": 15.316640853881836, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.597403136, + "gpu_mem": 4.436932096, + "loss": 0.5804, + "grad_norm": 15.086557388305664, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.597403136, + "gpu_mem": 4.436909056, + "loss": 0.8475, + "grad_norm": 15.285741806030273, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.597403136, + "gpu_mem": 4.436941312, + "loss": 0.7196, + "grad_norm": 16.01751708984375, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.597599744, + "gpu_mem": 4.436936704, + "loss": 0.6657, + "grad_norm": 14.658589363098145, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.597599744, + "gpu_mem": 4.43694592, + "loss": 0.5707, + "grad_norm": 13.987504959106445, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.597796352, + "gpu_mem": 4.436919808, + "loss": 0.6462, + "grad_norm": 16.161376953125, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.597796352, + "gpu_mem": 4.436939776, + "loss": 0.6419, + "grad_norm": 14.892619132995605, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.597796352, + "gpu_mem": 4.436913664, + "loss": 0.7909, + "grad_norm": 20.421140670776367, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.597796352, + "gpu_mem": 4.43693824, + "loss": 0.7902, + "grad_norm": 19.43617057800293, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.597796352, + "gpu_mem": 4.43692288, + "loss": 0.5189, + "grad_norm": 13.899816513061523, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.59799296, + "gpu_mem": 4.436956672, + "loss": 0.8969, + "grad_norm": 20.122882843017578, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.59799296, + "gpu_mem": 4.436936704, + "loss": 0.7027, + "grad_norm": 18.85349464416504, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.598189568, + "gpu_mem": 4.436921344, + "loss": 0.6072, + "grad_norm": 14.317890167236328, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.598189568, + "gpu_mem": 4.436956672, + "loss": 0.7896, + "grad_norm": 15.42298412322998, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.598189568, + "gpu_mem": 4.436962816, + "loss": 0.4405, + "grad_norm": 11.796723365783691, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.598189568, + "gpu_mem": 4.436925952, + "loss": 0.6571, + "grad_norm": 18.241872787475586, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.598386176, + "gpu_mem": 4.436904448, + "loss": 0.7262, + "grad_norm": 16.054351806640625, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.598386176, + "gpu_mem": 4.436956672, + "loss": 0.727, + "grad_norm": 14.673053741455078, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.598386176, + "gpu_mem": 4.436942848, + "loss": 0.5437, + "grad_norm": 16.495819091796875, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.598386176, + "gpu_mem": 4.436936704, + "loss": 0.8116, + "grad_norm": 20.99553108215332, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.598386176, + "gpu_mem": 4.436942848, + "loss": 0.623, + "grad_norm": 15.789752960205078, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.598386176, + "gpu_mem": 4.436919808, + "loss": 0.736, + "grad_norm": 15.50102710723877, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.598582784, + "gpu_mem": 4.436933632, + "loss": 0.4783, + "grad_norm": 13.301855087280273, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.598582784, + "gpu_mem": 4.436933632, + "loss": 0.8035, + "grad_norm": 16.664751052856445, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.598779392, + "gpu_mem": 4.436902912, + "loss": 0.5659, + "grad_norm": 12.607199668884277, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.598779392, + "gpu_mem": 4.436936704, + "loss": 0.7759, + "grad_norm": 19.707883834838867, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.598779392, + "gpu_mem": 4.4369152, + "loss": 0.6024, + "grad_norm": 13.373634338378906, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.598779392, + "gpu_mem": 4.43692288, + "loss": 0.6398, + "grad_norm": 16.981311798095703, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.598779392, + "gpu_mem": 4.436941312, + "loss": 0.7424, + "grad_norm": 18.689470291137695, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.598976, + "gpu_mem": 4.436909056, + "loss": 0.8125, + "grad_norm": 20.15205955505371, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.598976, + "gpu_mem": 4.436913664, + "loss": 0.6231, + "grad_norm": 16.987598419189453, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.598976, + "gpu_mem": 4.436909056, + "loss": 0.5753, + "grad_norm": 14.123588562011719, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.598976, + "gpu_mem": 4.4369536, + "loss": 0.6416, + "grad_norm": 15.37139892578125, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.598976, + "gpu_mem": 4.436936704, + "loss": 0.6101, + "grad_norm": 15.264710426330566, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.598976, + "gpu_mem": 4.436925952, + "loss": 0.5424, + "grad_norm": 13.287707328796387, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.599172608, + "gpu_mem": 4.436947456, + "loss": 0.643, + "grad_norm": 16.137733459472656, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.599172608, + "gpu_mem": 4.436913664, + "loss": 0.5236, + "grad_norm": 13.411752700805664, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.599369216, + "gpu_mem": 4.436929024, + "loss": 0.7037, + "grad_norm": 19.13396453857422, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.599369216, + "gpu_mem": 4.436929024, + "loss": 0.62, + "grad_norm": 19.31201171875, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.599565824, + "gpu_mem": 4.436919808, + "loss": 0.4933, + "grad_norm": 17.773523330688477, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.599565824, + "gpu_mem": 4.43693056, + "loss": 0.9072, + "grad_norm": 23.425249099731445, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.599565824, + "gpu_mem": 4.436955136, + "loss": 0.5917, + "grad_norm": 15.613957405090332, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.599565824, + "gpu_mem": 4.43690752, + "loss": 0.7479, + "grad_norm": 17.668733596801758, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.599565824, + "gpu_mem": 4.436942848, + "loss": 0.6127, + "grad_norm": 18.21131134033203, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.599565824, + "gpu_mem": 4.436904448, + "loss": 0.5973, + "grad_norm": 15.488285064697266, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.599565824, + "gpu_mem": 4.43692288, + "loss": 0.6427, + "grad_norm": 17.640857696533203, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.599565824, + "gpu_mem": 4.4369152, + "loss": 0.6945, + "grad_norm": 18.074541091918945, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.599762432, + "gpu_mem": 4.436952064, + "loss": 0.6594, + "grad_norm": 16.855730056762695, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.599762432, + "gpu_mem": 4.436912128, + "loss": 0.5714, + "grad_norm": 15.603768348693848, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.599762432, + "gpu_mem": 4.436925952, + "loss": 0.4892, + "grad_norm": 13.39127254486084, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.59995904, + "gpu_mem": 4.43693056, + "loss": 0.4016, + "grad_norm": 13.416767120361328, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.59995904, + "gpu_mem": 4.43689216, + "loss": 0.631, + "grad_norm": 14.418962478637695, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.59995904, + "gpu_mem": 4.4369152, + "loss": 0.8359, + "grad_norm": 16.727617263793945, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.59995904, + "gpu_mem": 4.436913664, + "loss": 0.6041, + "grad_norm": 16.177026748657227, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.59995904, + "gpu_mem": 4.436932096, + "loss": 0.6515, + "grad_norm": 16.122493743896484, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.600155648, + "gpu_mem": 4.436929024, + "loss": 0.6664, + "grad_norm": 17.431591033935547, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.600155648, + "gpu_mem": 4.436927488, + "loss": 0.6457, + "grad_norm": 15.978360176086426, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.600352256, + "gpu_mem": 4.43694592, + "loss": 0.4586, + "grad_norm": 16.935365676879883, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.600352256, + "gpu_mem": 4.43690752, + "loss": 0.7108, + "grad_norm": 18.572053909301758, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.600352256, + "gpu_mem": 4.436952064, + "loss": 0.4705, + "grad_norm": 14.376107215881348, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.600352256, + "gpu_mem": 4.436916736, + "loss": 0.6177, + "grad_norm": 18.60902214050293, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.600352256, + "gpu_mem": 4.436944384, + "loss": 0.649, + "grad_norm": 17.856176376342773, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.600352256, + "gpu_mem": 4.436924416, + "loss": 0.6059, + "grad_norm": 15.642708778381348, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.600352256, + "gpu_mem": 4.436970496, + "loss": 0.7956, + "grad_norm": 18.344951629638672, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.600352256, + "gpu_mem": 4.436935168, + "loss": 0.557, + "grad_norm": 13.898965835571289, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.600548864, + "gpu_mem": 4.436925952, + "loss": 0.5075, + "grad_norm": 16.26864242553711, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.600548864, + "gpu_mem": 4.436919808, + "loss": 0.5331, + "grad_norm": 15.974427223205566, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.600548864, + "gpu_mem": 4.436904448, + "loss": 0.6843, + "grad_norm": 18.71124267578125, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.600548864, + "gpu_mem": 4.43692288, + "loss": 0.7346, + "grad_norm": 19.61048126220703, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.600548864, + "gpu_mem": 4.436924416, + "loss": 0.6187, + "grad_norm": 20.987255096435547, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.600745472, + "gpu_mem": 4.436929024, + "loss": 0.6664, + "grad_norm": 21.08690071105957, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.600745472, + "gpu_mem": 4.436932096, + "loss": 0.4841, + "grad_norm": 17.971864700317383, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.600745472, + "gpu_mem": 4.436925952, + "loss": 0.7156, + "grad_norm": 17.415672302246094, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.600745472, + "gpu_mem": 4.436952064, + "loss": 0.7016, + "grad_norm": 17.462265014648438, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.600745472, + "gpu_mem": 4.436919808, + "loss": 0.3888, + "grad_norm": 11.58098316192627, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.600745472, + "gpu_mem": 4.436947456, + "loss": 0.4984, + "grad_norm": 12.686566352844238, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.600745472, + "gpu_mem": 4.436955136, + "loss": 0.6294, + "grad_norm": 17.73417854309082, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.60094208, + "gpu_mem": 4.436936704, + "loss": 0.6481, + "grad_norm": 16.461219787597656, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.60094208, + "gpu_mem": 4.43692288, + "loss": 0.4695, + "grad_norm": 17.10310173034668, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.60094208, + "gpu_mem": 4.436933632, + "loss": 0.5428, + "grad_norm": 15.674758911132812, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.60094208, + "gpu_mem": 4.436925952, + "loss": 0.6102, + "grad_norm": 21.17792320251465, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.60094208, + "gpu_mem": 4.436942848, + "loss": 0.554, + "grad_norm": 12.891200065612793, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.60094208, + "gpu_mem": 4.4369152, + "loss": 0.4911, + "grad_norm": 13.826659202575684, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.60094208, + "gpu_mem": 4.43694592, + "loss": 0.5037, + "grad_norm": 16.219091415405273, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.60094208, + "gpu_mem": 4.436927488, + "loss": 0.6638, + "grad_norm": 24.065914154052734, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.601138688, + "gpu_mem": 4.4369152, + "loss": 0.7032, + "grad_norm": 20.87546730041504, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.601138688, + "gpu_mem": 4.436927488, + "loss": 0.536, + "grad_norm": 19.139184951782227, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.601138688, + "gpu_mem": 4.436933632, + "loss": 0.5284, + "grad_norm": 20.721454620361328, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.601138688, + "gpu_mem": 4.436921344, + "loss": 0.4545, + "grad_norm": 16.5568790435791, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.601138688, + "gpu_mem": 4.436910592, + "loss": 0.5425, + "grad_norm": 17.787151336669922, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436912128, + "loss": 0.6073, + "grad_norm": 22.321462631225586, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436925952, + "loss": 0.6282, + "grad_norm": 19.181758880615234, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436929024, + "loss": 0.5122, + "grad_norm": 16.554208755493164, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436939776, + "loss": 0.6347, + "grad_norm": 20.132646560668945, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436913664, + "loss": 0.5509, + "grad_norm": 13.966801643371582, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436929024, + "loss": 0.7989, + "grad_norm": 22.66693115234375, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.601335296, + "gpu_mem": 4.43693824, + "loss": 0.6479, + "grad_norm": 18.851064682006836, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436912128, + "loss": 0.6457, + "grad_norm": 18.89881706237793, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436918272, + "loss": 0.5989, + "grad_norm": 15.893594741821289, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.601335296, + "gpu_mem": 4.43690752, + "loss": 0.6257, + "grad_norm": 14.4843168258667, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.601335296, + "gpu_mem": 4.436913664, + "loss": 0.5477, + "grad_norm": 15.94545841217041, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.601531904, + "gpu_mem": 4.436950528, + "loss": 0.4314, + "grad_norm": 16.108261108398438, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.601531904, + "gpu_mem": 4.436898304, + "loss": 0.7293, + "grad_norm": 18.59519386291504, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.601531904, + "gpu_mem": 4.436918272, + "loss": 0.5356, + "grad_norm": 15.805500984191895, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.601531904, + "gpu_mem": 4.436918272, + "loss": 0.4623, + "grad_norm": 14.974614143371582, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.601531904, + "gpu_mem": 4.436916736, + "loss": 0.6571, + "grad_norm": 17.42402458190918, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.601531904, + "gpu_mem": 4.4369152, + "loss": 0.4361, + "grad_norm": 17.593778610229492, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.601531904, + "gpu_mem": 4.43690752, + "loss": 0.6892, + "grad_norm": 20.187175750732422, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.601531904, + "gpu_mem": 4.436967424, + "loss": 0.4511, + "grad_norm": 17.09150505065918, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.601728512, + "gpu_mem": 4.436912128, + "loss": 0.5398, + "grad_norm": 16.119115829467773, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.601728512, + "gpu_mem": 4.436895232, + "loss": 0.6896, + "grad_norm": 17.41630744934082, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.601728512, + "gpu_mem": 4.436925952, + "loss": 0.6744, + "grad_norm": 16.084402084350586, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.601728512, + "gpu_mem": 4.436970496, + "loss": 0.7774, + "grad_norm": 21.84779167175293, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436950528, + "loss": 0.3586, + "grad_norm": 11.665193557739258, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436950528, + "loss": 0.4292, + "grad_norm": 17.926979064941406, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436916736, + "loss": 0.5109, + "grad_norm": 20.63884925842285, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436941312, + "loss": 0.5038, + "grad_norm": 14.3490571975708, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436944384, + "loss": 0.5291, + "grad_norm": 16.429729461669922, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.60192512, + "gpu_mem": 4.43692288, + "loss": 0.4231, + "grad_norm": 16.346628189086914, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.60192512, + "gpu_mem": 4.43694592, + "loss": 0.4874, + "grad_norm": 17.505640029907227, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436925952, + "loss": 0.7187, + "grad_norm": 24.07867431640625, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436950528, + "loss": 0.4297, + "grad_norm": 12.626046180725098, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436933632, + "loss": 0.5672, + "grad_norm": 19.668537139892578, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436929024, + "loss": 0.502, + "grad_norm": 18.715856552124023, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436941312, + "loss": 0.579, + "grad_norm": 17.770305633544922, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436912128, + "loss": 0.6292, + "grad_norm": 18.273117065429688, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436925952, + "loss": 0.7132, + "grad_norm": 20.024412155151367, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436912128, + "loss": 0.4998, + "grad_norm": 15.930155754089355, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436905984, + "loss": 0.6005, + "grad_norm": 19.201862335205078, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436912128, + "loss": 0.4354, + "grad_norm": 13.440332412719727, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436925952, + "loss": 0.6141, + "grad_norm": 17.000104904174805, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.60192512, + "gpu_mem": 4.436909056, + "loss": 0.6596, + "grad_norm": 15.076282501220703, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.602121728, + "gpu_mem": 4.436962816, + "loss": 0.4761, + "grad_norm": 13.45787525177002, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.602121728, + "gpu_mem": 4.436905984, + "loss": 0.4571, + "grad_norm": 12.376617431640625, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.602121728, + "gpu_mem": 4.43698432, + "loss": 0.5278, + "grad_norm": 13.656095504760742, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.602121728, + "gpu_mem": 4.436927488, + "loss": 0.4919, + "grad_norm": 14.627265930175781, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.602121728, + "gpu_mem": 4.43694592, + "loss": 0.6545, + "grad_norm": 15.35767650604248, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.602121728, + "gpu_mem": 4.436921344, + "loss": 0.4678, + "grad_norm": 13.804354667663574, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.602121728, + "gpu_mem": 4.4369536, + "loss": 0.5563, + "grad_norm": 14.784932136535645, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.602121728, + "gpu_mem": 4.436973568, + "loss": 0.6614, + "grad_norm": 18.76473617553711, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436902912, + "loss": 0.4874, + "grad_norm": 16.57867431640625, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436916736, + "loss": 0.3197, + "grad_norm": 14.598225593566895, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436901376, + "loss": 0.7331, + "grad_norm": 22.817983627319336, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436939776, + "loss": 0.4072, + "grad_norm": 14.899096488952637, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436939776, + "loss": 0.6086, + "grad_norm": 18.94478416442871, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436925952, + "loss": 0.6146, + "grad_norm": 18.853437423706055, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436916736, + "loss": 0.4791, + "grad_norm": 15.45250415802002, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436921344, + "loss": 0.5383, + "grad_norm": 15.0850248336792, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436924416, + "loss": 0.8434, + "grad_norm": 25.946069717407227, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.602318336, + "gpu_mem": 4.43693056, + "loss": 0.4806, + "grad_norm": 16.02838706970215, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.602318336, + "gpu_mem": 4.436948992, + "loss": 0.4951, + "grad_norm": 18.025707244873047, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436942848, + "loss": 0.7444, + "grad_norm": 22.749496459960938, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436919808, + "loss": 0.4385, + "grad_norm": 19.637008666992188, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.602514944, + "gpu_mem": 4.43690752, + "loss": 0.696, + "grad_norm": 21.8166561126709, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436873728, + "loss": 0.6688, + "grad_norm": 21.058216094970703, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436921344, + "loss": 0.6057, + "grad_norm": 18.300865173339844, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436887552, + "loss": 0.6403, + "grad_norm": 16.147666931152344, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436935168, + "loss": 0.5772, + "grad_norm": 16.42509651184082, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436933632, + "loss": 0.6271, + "grad_norm": 15.57359790802002, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.602514944, + "gpu_mem": 4.436935168, + "loss": 0.3898, + "grad_norm": 12.728279113769531, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436944384, + "loss": 0.6584, + "grad_norm": 18.257699966430664, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436919808, + "loss": 0.6845, + "grad_norm": 17.00620460510254, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436904448, + "loss": 0.6283, + "grad_norm": 15.356157302856445, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436933632, + "loss": 0.6272, + "grad_norm": 15.59634017944336, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436947456, + "loss": 0.5798, + "grad_norm": 15.051132202148438, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436902912, + "loss": 0.4645, + "grad_norm": 13.531730651855469, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436909056, + "loss": 0.5733, + "grad_norm": 14.674735069274902, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.602711552, + "gpu_mem": 4.43693824, + "loss": 0.5987, + "grad_norm": 13.38436508178711, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436933632, + "loss": 0.529, + "grad_norm": 13.826847076416016, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436919808, + "loss": 0.5988, + "grad_norm": 13.51820182800293, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436933632, + "loss": 0.5696, + "grad_norm": 16.349960327148438, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.602711552, + "gpu_mem": 4.43692288, + "loss": 0.5611, + "grad_norm": 14.18722915649414, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436929024, + "loss": 0.4386, + "grad_norm": 12.110021591186523, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436933632, + "loss": 0.6844, + "grad_norm": 18.813871383666992, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436929024, + "loss": 0.4272, + "grad_norm": 11.674015045166016, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436902912, + "loss": 0.5054, + "grad_norm": 14.826602935791016, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436912128, + "loss": 0.506, + "grad_norm": 14.982539176940918, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.602711552, + "gpu_mem": 4.43693056, + "loss": 0.632, + "grad_norm": 17.70216178894043, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436901376, + "loss": 0.5847, + "grad_norm": 15.246076583862305, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436932096, + "loss": 0.7476, + "grad_norm": 17.954666137695312, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436941312, + "loss": 0.5975, + "grad_norm": 16.72612762451172, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436902912, + "loss": 0.8033, + "grad_norm": 18.41470718383789, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.602711552, + "gpu_mem": 4.43690752, + "loss": 0.5484, + "grad_norm": 15.98901653289795, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436932096, + "loss": 0.544, + "grad_norm": 14.799599647521973, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.602711552, + "gpu_mem": 4.436950528, + "loss": 0.493, + "grad_norm": 18.060483932495117, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436932096, + "loss": 0.4989, + "grad_norm": 14.406967163085938, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436982784, + "loss": 0.5597, + "grad_norm": 15.136961936950684, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.60290816, + "gpu_mem": 4.4369152, + "loss": 0.8606, + "grad_norm": 20.78803062438965, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436916736, + "loss": 0.6195, + "grad_norm": 17.412851333618164, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436916736, + "loss": 0.5651, + "grad_norm": 13.913155555725098, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.60290816, + "gpu_mem": 4.43692288, + "loss": 0.5309, + "grad_norm": 15.279781341552734, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436936704, + "loss": 0.4724, + "grad_norm": 14.58300495147705, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436941312, + "loss": 0.6365, + "grad_norm": 17.007728576660156, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436935168, + "loss": 0.4098, + "grad_norm": 13.13451099395752, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436929024, + "loss": 0.4156, + "grad_norm": 13.054964065551758, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.60290816, + "gpu_mem": 4.436942848, + "loss": 0.6286, + "grad_norm": 17.646623611450195, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436935168, + "loss": 0.6612, + "grad_norm": 18.78885841369629, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436919808, + "loss": 0.55, + "grad_norm": 16.956432342529297, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436929024, + "loss": 0.7343, + "grad_norm": 18.79220199584961, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43693824, + "loss": 0.4993, + "grad_norm": 16.02692413330078, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436941312, + "loss": 0.5158, + "grad_norm": 15.800230026245117, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436895232, + "loss": 0.5404, + "grad_norm": 15.02462387084961, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436948992, + "loss": 0.2999, + "grad_norm": 12.197375297546387, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436950528, + "loss": 0.4632, + "grad_norm": 14.600228309631348, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436895232, + "loss": 0.5256, + "grad_norm": 13.9147367477417, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436929024, + "loss": 0.6853, + "grad_norm": 15.993699073791504, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43690752, + "loss": 0.5601, + "grad_norm": 17.368450164794922, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43693824, + "loss": 0.574, + "grad_norm": 17.083736419677734, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436913664, + "loss": 0.5518, + "grad_norm": 16.396587371826172, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436947456, + "loss": 0.4068, + "grad_norm": 13.29260540008545, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436967424, + "loss": 0.7244, + "grad_norm": 16.86695671081543, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436932096, + "loss": 0.5048, + "grad_norm": 14.015303611755371, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436952064, + "loss": 0.3939, + "grad_norm": 14.296407699584961, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436932096, + "loss": 0.4047, + "grad_norm": 13.573908805847168, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436933632, + "loss": 0.5385, + "grad_norm": 17.336406707763672, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436925952, + "loss": 0.3672, + "grad_norm": 12.451120376586914, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436924416, + "loss": 0.5508, + "grad_norm": 16.951068878173828, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43693824, + "loss": 0.5398, + "grad_norm": 17.679006576538086, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436909056, + "loss": 0.5231, + "grad_norm": 16.765413284301758, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436959744, + "loss": 0.5383, + "grad_norm": 16.528413772583008, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436956672, + "loss": 0.5834, + "grad_norm": 14.182188987731934, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436936704, + "loss": 0.5641, + "grad_norm": 17.734580993652344, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436918272, + "loss": 0.4581, + "grad_norm": 13.467229843139648, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436927488, + "loss": 0.5886, + "grad_norm": 19.73381233215332, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436895232, + "loss": 0.517, + "grad_norm": 18.149707794189453, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436956672, + "loss": 0.6138, + "grad_norm": 15.585168838500977, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436955136, + "loss": 0.5503, + "grad_norm": 16.613147735595703, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436910592, + "loss": 0.5723, + "grad_norm": 14.131546974182129, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436942848, + "loss": 0.4206, + "grad_norm": 13.432826042175293, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436936704, + "loss": 0.5563, + "grad_norm": 17.747425079345703, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43692288, + "loss": 0.4764, + "grad_norm": 14.853006362915039, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43692288, + "loss": 0.5622, + "grad_norm": 19.073272705078125, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436948992, + "loss": 0.5273, + "grad_norm": 18.40043067932129, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436936704, + "loss": 0.3855, + "grad_norm": 15.22576904296875, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436927488, + "loss": 0.6178, + "grad_norm": 14.744359016418457, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436919808, + "loss": 0.666, + "grad_norm": 19.768892288208008, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436942848, + "loss": 0.4902, + "grad_norm": 14.939704895019531, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436932096, + "loss": 0.3683, + "grad_norm": 14.810441017150879, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436916736, + "loss": 0.3841, + "grad_norm": 14.184685707092285, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436982784, + "loss": 0.4284, + "grad_norm": 13.818171501159668, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436925952, + "loss": 0.5806, + "grad_norm": 16.559803009033203, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436912128, + "loss": 0.5568, + "grad_norm": 17.104646682739258, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43697664, + "loss": 0.5284, + "grad_norm": 14.745490074157715, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436905984, + "loss": 0.5634, + "grad_norm": 16.035083770751953, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436933632, + "loss": 0.3336, + "grad_norm": 13.802583694458008, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436936704, + "loss": 0.5102, + "grad_norm": 14.44839859008789, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436912128, + "loss": 0.5737, + "grad_norm": 16.932273864746094, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436939776, + "loss": 0.5132, + "grad_norm": 18.179079055786133, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436948992, + "loss": 0.5511, + "grad_norm": 17.299436569213867, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.603104768, + "gpu_mem": 4.43694592, + "loss": 0.4371, + "grad_norm": 14.124671936035156, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436918272, + "loss": 0.592, + "grad_norm": 15.902559280395508, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436664832, + "loss": 0.5445, + "grad_norm": 20.001577377319336, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.603104768, + "gpu_mem": 4.436664832, + "train_runtime": 8064.8014, + "train_samples_per_second": 4.948, + "train_steps_per_second": 0.077, + "total_flos": 0.0, + "train_loss": 0.9421886211404433 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0052eed638e4aeb48f103586efb96096bb8d3ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7b5c2e829292894193327b0e9baac81fca10288c --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.33917546305516827 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f56974e0cd6d3cfb30f5b380d829e8fc8f2e0487 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25389056 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-hellaswag-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2", + "seed": 42, + "timestamp": "2025-08-31T06:39:18.710581" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..feca5cf89e1fd423f0456784055b3332d131eee6 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.497624576, + "gpu_mem": 4.519314944, + "loss": 3.4877, + "grad_norm": 206.80653381347656, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.503916032, + "gpu_mem": 4.722420736, + "loss": 3.6203, + "grad_norm": 205.2892608642578, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.504899072, + "gpu_mem": 4.722428416, + "loss": 2.5214, + "grad_norm": 125.75362396240234, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.50607872, + "gpu_mem": 4.722462208, + "loss": 1.9847, + "grad_norm": 62.69116973876953, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.50706176, + "gpu_mem": 4.722425344, + "loss": 1.4603, + "grad_norm": 17.37816047668457, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.507848192, + "gpu_mem": 4.722471424, + "loss": 1.5535, + "grad_norm": 34.58354568481445, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.508831232, + "gpu_mem": 4.722431488, + "loss": 1.3393, + "grad_norm": 9.02051830291748, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.509617664, + "gpu_mem": 4.722462208, + "loss": 1.4101, + "grad_norm": 22.1068172454834, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.510404096, + "gpu_mem": 4.722462208, + "loss": 1.5199, + "grad_norm": 25.473468780517578, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.511190528, + "gpu_mem": 4.722405376, + "loss": 1.6228, + "grad_norm": 28.643245697021484, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.51197696, + "gpu_mem": 4.722425344, + "loss": 1.4541, + "grad_norm": 15.961499214172363, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.512763392, + "gpu_mem": 4.722422272, + "loss": 1.4275, + "grad_norm": 9.634175300598145, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.513549824, + "gpu_mem": 4.722414592, + "loss": 1.4401, + "grad_norm": 10.941969871520996, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.514336256, + "gpu_mem": 4.722440704, + "loss": 1.4618, + "grad_norm": 9.79638671875, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.515122688, + "gpu_mem": 4.722439168, + "loss": 1.4712, + "grad_norm": 10.663593292236328, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.515712512, + "gpu_mem": 4.722431488, + "loss": 1.3681, + "grad_norm": 2.4212963581085205, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.516498944, + "gpu_mem": 4.722431488, + "loss": 1.459, + "grad_norm": 11.662422180175781, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.517088768, + "gpu_mem": 4.722431488, + "loss": 1.3307, + "grad_norm": 6.916178226470947, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.517678592, + "gpu_mem": 4.722431488, + "loss": 1.5011, + "grad_norm": 12.171334266662598, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.518465024, + "gpu_mem": 4.722405376, + "loss": 1.4824, + "grad_norm": 12.152570724487305, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.519054848, + "gpu_mem": 4.722422272, + "loss": 1.473, + "grad_norm": 12.573491096496582, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.51984128, + "gpu_mem": 4.722429952, + "loss": 1.4922, + "grad_norm": 13.624589920043945, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.520431104, + "gpu_mem": 4.722443776, + "loss": 1.3736, + "grad_norm": 6.45763635635376, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.521020928, + "gpu_mem": 4.722428416, + "loss": 1.3956, + "grad_norm": 9.163595199584961, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.521610752, + "gpu_mem": 4.722416128, + "loss": 1.5325, + "grad_norm": 13.91305160522461, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.522200576, + "gpu_mem": 4.722422272, + "loss": 1.508, + "grad_norm": 13.167956352233887, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.5227904, + "gpu_mem": 4.722429952, + "loss": 1.4744, + "grad_norm": 12.06039810180664, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.523380224, + "gpu_mem": 4.722425344, + "loss": 1.4052, + "grad_norm": 5.644166946411133, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.523970048, + "gpu_mem": 4.72243456, + "loss": 1.4767, + "grad_norm": 7.836399078369141, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.524559872, + "gpu_mem": 4.722406912, + "loss": 1.4402, + "grad_norm": 6.692495822906494, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.525149696, + "gpu_mem": 4.722462208, + "loss": 1.3998, + "grad_norm": 3.5728275775909424, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.52573952, + "gpu_mem": 4.722454528, + "loss": 1.3697, + "grad_norm": 1.213369607925415, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.526329344, + "gpu_mem": 4.722408448, + "loss": 1.4302, + "grad_norm": 4.651853084564209, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.52672256, + "gpu_mem": 4.72242688, + "loss": 1.4009, + "grad_norm": 2.059593439102173, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.527312384, + "gpu_mem": 4.722448384, + "loss": 1.3722, + "grad_norm": 5.320507526397705, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.527902208, + "gpu_mem": 4.722446848, + "loss": 1.511, + "grad_norm": 12.134772300720215, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.528492032, + "gpu_mem": 4.722479104, + "loss": 1.4928, + "grad_norm": 9.001901626586914, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.529081856, + "gpu_mem": 4.722431488, + "loss": 1.4343, + "grad_norm": 5.4666829109191895, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.52967168, + "gpu_mem": 4.72248832, + "loss": 1.4734, + "grad_norm": 11.486861228942871, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.530261504, + "gpu_mem": 4.722416128, + "loss": 1.4266, + "grad_norm": 4.9751691818237305, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.530851328, + "gpu_mem": 4.722443776, + "loss": 1.4071, + "grad_norm": 5.411238193511963, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.531244544, + "gpu_mem": 4.7224576, + "loss": 1.4826, + "grad_norm": 6.636025905609131, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.531834368, + "gpu_mem": 4.722463744, + "loss": 1.396, + "grad_norm": 2.6955389976501465, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.532424192, + "gpu_mem": 4.72244224, + "loss": 1.3941, + "grad_norm": 2.0994045734405518, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.532817408, + "gpu_mem": 4.72244224, + "loss": 1.453, + "grad_norm": 5.582932949066162, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.533407232, + "gpu_mem": 4.72244224, + "loss": 1.3386, + "grad_norm": 3.96662974357605, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.533997056, + "gpu_mem": 4.722428416, + "loss": 1.418, + "grad_norm": 5.093774318695068, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.534390272, + "gpu_mem": 4.722446848, + "loss": 1.3603, + "grad_norm": 1.871256947517395, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.534783488, + "gpu_mem": 4.722459136, + "loss": 1.432, + "grad_norm": 3.994227409362793, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.535373312, + "gpu_mem": 4.722436096, + "loss": 1.3913, + "grad_norm": 2.2748217582702637, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.535963136, + "gpu_mem": 4.722420736, + "loss": 1.4377, + "grad_norm": 5.555188179016113, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.53655296, + "gpu_mem": 4.722425344, + "loss": 1.387, + "grad_norm": 3.4296271800994873, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.536946176, + "gpu_mem": 4.722452992, + "loss": 1.4705, + "grad_norm": 6.458395481109619, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.537339392, + "gpu_mem": 4.722428416, + "loss": 1.4061, + "grad_norm": 4.719325542449951, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.537929216, + "gpu_mem": 4.722446848, + "loss": 1.4203, + "grad_norm": 4.7936811447143555, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.53851904, + "gpu_mem": 4.722440704, + "loss": 1.488, + "grad_norm": 6.068424224853516, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.538912256, + "gpu_mem": 4.722406912, + "loss": 1.3676, + "grad_norm": 2.8458752632141113, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.539305472, + "gpu_mem": 4.722436096, + "loss": 1.4614, + "grad_norm": 5.1316728591918945, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.539895296, + "gpu_mem": 4.7224192, + "loss": 1.4677, + "grad_norm": 6.081681251525879, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.540288512, + "gpu_mem": 4.722460672, + "loss": 1.4781, + "grad_norm": 5.4584736824035645, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.540681728, + "gpu_mem": 4.72242688, + "loss": 1.4096, + "grad_norm": 2.259345293045044, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.541271552, + "gpu_mem": 4.722466816, + "loss": 1.3113, + "grad_norm": 1.284903883934021, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.541664768, + "gpu_mem": 4.722420736, + "loss": 1.599, + "grad_norm": 6.069429397583008, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.542254592, + "gpu_mem": 4.722425344, + "loss": 1.5779, + "grad_norm": 5.764050006866455, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.542647808, + "gpu_mem": 4.722422272, + "loss": 1.4256, + "grad_norm": 2.4828882217407227, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.543041024, + "gpu_mem": 4.722440704, + "loss": 1.4357, + "grad_norm": 3.0779500007629395, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.543630848, + "gpu_mem": 4.722433024, + "loss": 1.4396, + "grad_norm": 3.9280571937561035, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.544024064, + "gpu_mem": 4.722417664, + "loss": 1.4198, + "grad_norm": 5.579930305480957, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.54441728, + "gpu_mem": 4.72248832, + "loss": 1.4352, + "grad_norm": 5.997748374938965, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.544810496, + "gpu_mem": 4.722439168, + "loss": 1.4231, + "grad_norm": 3.3999271392822266, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.545203712, + "gpu_mem": 4.722463744, + "loss": 1.3506, + "grad_norm": 1.8234281539916992, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.545793536, + "gpu_mem": 4.72243456, + "loss": 1.4309, + "grad_norm": 3.1378674507141113, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.54638336, + "gpu_mem": 4.72242688, + "loss": 1.4917, + "grad_norm": 6.492273807525635, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.546776576, + "gpu_mem": 4.722420736, + "loss": 1.418, + "grad_norm": 3.871703624725342, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.547169792, + "gpu_mem": 4.72244992, + "loss": 1.4501, + "grad_norm": 3.3749847412109375, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.547563008, + "gpu_mem": 4.722440704, + "loss": 1.4523, + "grad_norm": 4.267124652862549, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.547956224, + "gpu_mem": 4.722428416, + "loss": 1.4128, + "grad_norm": 2.430520534515381, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.54834944, + "gpu_mem": 4.722420736, + "loss": 1.5126, + "grad_norm": 4.287976264953613, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.548742656, + "gpu_mem": 4.72247296, + "loss": 1.4548, + "grad_norm": 3.82047963142395, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.549135872, + "gpu_mem": 4.722451456, + "loss": 1.4104, + "grad_norm": 1.3908411264419556, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.549529088, + "gpu_mem": 4.722445312, + "loss": 1.3769, + "grad_norm": 2.093559503555298, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.550118912, + "gpu_mem": 4.722422272, + "loss": 1.4242, + "grad_norm": 2.8620200157165527, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.550512128, + "gpu_mem": 4.722443776, + "loss": 1.4925, + "grad_norm": 4.449801921844482, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.550905344, + "gpu_mem": 4.722416128, + "loss": 1.4365, + "grad_norm": 3.1816444396972656, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.551101952, + "gpu_mem": 4.722423808, + "loss": 1.4112, + "grad_norm": 2.2299880981445312, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.551691776, + "gpu_mem": 4.72244224, + "loss": 1.4323, + "grad_norm": 3.1118712425231934, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.552084992, + "gpu_mem": 4.722431488, + "loss": 1.4147, + "grad_norm": 2.128331422805786, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.5522816, + "gpu_mem": 4.722429952, + "loss": 1.3969, + "grad_norm": 2.4060044288635254, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.552674816, + "gpu_mem": 4.722425344, + "loss": 1.4333, + "grad_norm": 2.7487502098083496, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.553068032, + "gpu_mem": 4.722429952, + "loss": 1.4427, + "grad_norm": 3.0203230381011963, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.553461248, + "gpu_mem": 4.722440704, + "loss": 1.428, + "grad_norm": 2.442150354385376, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.553854464, + "gpu_mem": 4.722443776, + "loss": 1.4006, + "grad_norm": 1.1741440296173096, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.55424768, + "gpu_mem": 4.722443776, + "loss": 1.4334, + "grad_norm": 1.703808307647705, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.554640896, + "gpu_mem": 4.722439168, + "loss": 1.3907, + "grad_norm": 1.2035653591156006, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.555034112, + "gpu_mem": 4.7224576, + "loss": 1.4377, + "grad_norm": 2.6262130737304688, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.555427328, + "gpu_mem": 4.722460672, + "loss": 1.4164, + "grad_norm": 1.7251969575881958, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.555820544, + "gpu_mem": 4.722437632, + "loss": 1.4307, + "grad_norm": 2.2175540924072266, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.55621376, + "gpu_mem": 4.722448384, + "loss": 1.3971, + "grad_norm": 1.1592307090759277, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.556606976, + "gpu_mem": 4.722448384, + "loss": 1.4845, + "grad_norm": 3.845695734024048, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.557000192, + "gpu_mem": 4.722423808, + "loss": 1.4292, + "grad_norm": 2.4601757526397705, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.557393408, + "gpu_mem": 4.722452992, + "loss": 1.3986, + "grad_norm": 1.7245676517486572, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.557786624, + "gpu_mem": 4.722429952, + "loss": 1.3702, + "grad_norm": 1.6252422332763672, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.55817984, + "gpu_mem": 4.722446848, + "loss": 1.432, + "grad_norm": 2.9775218963623047, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.558376448, + "gpu_mem": 4.722414592, + "loss": 1.3925, + "grad_norm": 2.624054193496704, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.558769664, + "gpu_mem": 4.722429952, + "loss": 1.3719, + "grad_norm": 1.5614051818847656, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.558966272, + "gpu_mem": 4.722409984, + "loss": 1.5501, + "grad_norm": 5.381682872772217, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.559359488, + "gpu_mem": 4.722451456, + "loss": 1.4305, + "grad_norm": 2.4415228366851807, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.559752704, + "gpu_mem": 4.722446848, + "loss": 1.3802, + "grad_norm": 1.2388726472854614, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.559949312, + "gpu_mem": 4.722452992, + "loss": 1.4365, + "grad_norm": 3.2533674240112305, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.560342528, + "gpu_mem": 4.72244992, + "loss": 1.4564, + "grad_norm": 3.199009418487549, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.560539136, + "gpu_mem": 4.722451456, + "loss": 1.4518, + "grad_norm": 3.5905706882476807, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.560932352, + "gpu_mem": 4.722448384, + "loss": 1.4176, + "grad_norm": 2.1957826614379883, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.561325568, + "gpu_mem": 4.722428416, + "loss": 1.4061, + "grad_norm": 1.9331721067428589, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.561522176, + "gpu_mem": 4.722423808, + "loss": 1.4281, + "grad_norm": 3.259641170501709, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.561915392, + "gpu_mem": 4.72244224, + "loss": 1.4007, + "grad_norm": 1.1054850816726685, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.562308608, + "gpu_mem": 4.722452992, + "loss": 1.387, + "grad_norm": 0.4628679156303406, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.562701824, + "gpu_mem": 4.722439168, + "loss": 1.401, + "grad_norm": 1.1815773248672485, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.562898432, + "gpu_mem": 4.722454528, + "loss": 1.3949, + "grad_norm": 2.1022043228149414, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.563291648, + "gpu_mem": 4.722436096, + "loss": 1.3966, + "grad_norm": 2.702901840209961, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.563684864, + "gpu_mem": 4.722462208, + "loss": 1.431, + "grad_norm": 2.5681867599487305, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.563881472, + "gpu_mem": 4.722420736, + "loss": 1.3535, + "grad_norm": 0.7861620187759399, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.564274688, + "gpu_mem": 4.722452992, + "loss": 1.3924, + "grad_norm": 2.103865623474121, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.564667904, + "gpu_mem": 4.722446848, + "loss": 1.3661, + "grad_norm": 1.2845231294631958, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.56506112, + "gpu_mem": 4.722448384, + "loss": 1.3573, + "grad_norm": 2.1432478427886963, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.565454336, + "gpu_mem": 4.722423808, + "loss": 1.337, + "grad_norm": 2.771897554397583, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.565650944, + "gpu_mem": 4.722433024, + "loss": 1.281, + "grad_norm": 4.327815055847168, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.56604416, + "gpu_mem": 4.7224192, + "loss": 1.3106, + "grad_norm": 5.115114688873291, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.566437376, + "gpu_mem": 4.722456064, + "loss": 1.1087, + "grad_norm": 6.275544166564941, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.566633984, + "gpu_mem": 4.722452992, + "loss": 1.2338, + "grad_norm": 9.331634521484375, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.5670272, + "gpu_mem": 4.722452992, + "loss": 1.1956, + "grad_norm": 23.019941329956055, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.567223808, + "gpu_mem": 4.72244224, + "loss": 1.4869, + "grad_norm": 18.142070770263672, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.567617024, + "gpu_mem": 4.72244224, + "loss": 1.1378, + "grad_norm": 21.46803092956543, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.567813632, + "gpu_mem": 4.722423808, + "loss": 1.621, + "grad_norm": 11.068168640136719, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.568206848, + "gpu_mem": 4.72243456, + "loss": 1.3117, + "grad_norm": 4.053557395935059, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.568403456, + "gpu_mem": 4.722443776, + "loss": 1.5003, + "grad_norm": 17.426782608032227, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.568796672, + "gpu_mem": 4.722459136, + "loss": 1.42, + "grad_norm": 19.8308162689209, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.56899328, + "gpu_mem": 4.722406912, + "loss": 1.5648, + "grad_norm": 50.59262466430664, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.569386496, + "gpu_mem": 4.72242688, + "loss": 1.4659, + "grad_norm": 34.8862190246582, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.569779712, + "gpu_mem": 4.722408448, + "loss": 1.3647, + "grad_norm": 11.367494583129883, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.56997632, + "gpu_mem": 4.722425344, + "loss": 1.4947, + "grad_norm": 14.498015403747559, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.570369536, + "gpu_mem": 4.722431488, + "loss": 1.3975, + "grad_norm": 8.78077220916748, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.570566144, + "gpu_mem": 4.722428416, + "loss": 1.3666, + "grad_norm": 5.997957706451416, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.57095936, + "gpu_mem": 4.722454528, + "loss": 1.3465, + "grad_norm": 9.759771347045898, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.571155968, + "gpu_mem": 4.722428416, + "loss": 1.4114, + "grad_norm": 7.126190185546875, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.571549184, + "gpu_mem": 4.722468352, + "loss": 1.3975, + "grad_norm": 7.005679607391357, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.571745792, + "gpu_mem": 4.722417664, + "loss": 1.393, + "grad_norm": 1.8218727111816406, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.5719424, + "gpu_mem": 4.72242688, + "loss": 1.4318, + "grad_norm": 3.2201011180877686, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.572139008, + "gpu_mem": 4.722446848, + "loss": 1.4583, + "grad_norm": 3.4501521587371826, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.572335616, + "gpu_mem": 4.722437632, + "loss": 1.4217, + "grad_norm": 2.572826862335205, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.572728832, + "gpu_mem": 4.72244992, + "loss": 1.441, + "grad_norm": 2.904571771621704, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.57292544, + "gpu_mem": 4.722414592, + "loss": 1.4881, + "grad_norm": 3.377671718597412, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.573122048, + "gpu_mem": 4.722445312, + "loss": 1.3845, + "grad_norm": 1.5085033178329468, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.573515264, + "gpu_mem": 4.722440704, + "loss": 1.452, + "grad_norm": 3.825517177581787, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.573711872, + "gpu_mem": 4.72246528, + "loss": 1.5476, + "grad_norm": 5.007538318634033, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.574105088, + "gpu_mem": 4.722402304, + "loss": 1.5175, + "grad_norm": 3.7622299194335938, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.574301696, + "gpu_mem": 4.722456064, + "loss": 1.3788, + "grad_norm": 0.7600662708282471, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.574498304, + "gpu_mem": 4.722448384, + "loss": 1.4394, + "grad_norm": 2.114057779312134, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.57489152, + "gpu_mem": 4.722463744, + "loss": 1.4456, + "grad_norm": 1.844547986984253, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.575088128, + "gpu_mem": 4.72246528, + "loss": 1.4021, + "grad_norm": 0.9557872414588928, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.575481344, + "gpu_mem": 4.722433024, + "loss": 1.4121, + "grad_norm": 2.265564203262329, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.575677952, + "gpu_mem": 4.722425344, + "loss": 1.3913, + "grad_norm": 1.8652262687683105, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.57587456, + "gpu_mem": 4.722460672, + "loss": 1.4228, + "grad_norm": 2.991990804672241, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.576071168, + "gpu_mem": 4.722408448, + "loss": 1.4271, + "grad_norm": 3.0147430896759033, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.576464384, + "gpu_mem": 4.722443776, + "loss": 1.433, + "grad_norm": 3.3442089557647705, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.576660992, + "gpu_mem": 4.722433024, + "loss": 1.3897, + "grad_norm": 1.7460036277770996, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.577054208, + "gpu_mem": 4.72246528, + "loss": 1.4429, + "grad_norm": 3.4637093544006348, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.577250816, + "gpu_mem": 4.722459136, + "loss": 1.3571, + "grad_norm": 1.767704725265503, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.577447424, + "gpu_mem": 4.722437632, + "loss": 1.4393, + "grad_norm": 3.608151912689209, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.577644032, + "gpu_mem": 4.72243456, + "loss": 1.4243, + "grad_norm": 2.947531223297119, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.578037248, + "gpu_mem": 4.722471424, + "loss": 1.3742, + "grad_norm": 2.114414930343628, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.578233856, + "gpu_mem": 4.722445312, + "loss": 1.4175, + "grad_norm": 3.3065853118896484, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.578430464, + "gpu_mem": 4.722422272, + "loss": 1.396, + "grad_norm": 1.5328835248947144, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.578627072, + "gpu_mem": 4.722448384, + "loss": 1.3948, + "grad_norm": 1.3869370222091675, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.57882368, + "gpu_mem": 4.722460672, + "loss": 1.4205, + "grad_norm": 2.629889726638794, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.579020288, + "gpu_mem": 4.722422272, + "loss": 1.4218, + "grad_norm": 2.953979253768921, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.579216896, + "gpu_mem": 4.722431488, + "loss": 1.5115, + "grad_norm": 40.56075668334961, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.579413504, + "gpu_mem": 4.722413056, + "loss": 1.3764, + "grad_norm": 2.611297607421875, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.579610112, + "gpu_mem": 4.72242688, + "loss": 1.5569, + "grad_norm": 11.415590286254883, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.580003328, + "gpu_mem": 4.72243456, + "loss": 1.4793, + "grad_norm": 4.962634086608887, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.580199936, + "gpu_mem": 4.722416128, + "loss": 1.4183, + "grad_norm": 3.5406746864318848, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.580396544, + "gpu_mem": 4.722446848, + "loss": 1.4577, + "grad_norm": 4.6437458992004395, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.580593152, + "gpu_mem": 4.722417664, + "loss": 1.3516, + "grad_norm": 1.3187172412872314, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.58078976, + "gpu_mem": 4.72244224, + "loss": 1.4228, + "grad_norm": 2.972943067550659, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.581182976, + "gpu_mem": 4.722422272, + "loss": 1.4417, + "grad_norm": 3.1724488735198975, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.581379584, + "gpu_mem": 4.722454528, + "loss": 1.4318, + "grad_norm": 3.1886136531829834, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.581576192, + "gpu_mem": 4.722445312, + "loss": 1.419, + "grad_norm": 1.9611780643463135, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.5817728, + "gpu_mem": 4.722440704, + "loss": 1.3703, + "grad_norm": 2.001094102859497, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.581969408, + "gpu_mem": 4.722397696, + "loss": 1.4646, + "grad_norm": 3.241692543029785, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.582362624, + "gpu_mem": 4.722477568, + "loss": 1.4471, + "grad_norm": 3.134511947631836, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.582559232, + "gpu_mem": 4.722428416, + "loss": 16.2524, + "grad_norm": 513.8385620117188, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.58275584, + "gpu_mem": 4.722428416, + "loss": 1.375, + "grad_norm": 0.6915599703788757, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.582952448, + "gpu_mem": 4.722394624, + "loss": 1.4506, + "grad_norm": 3.003168821334839, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.583149056, + "gpu_mem": 4.72243456, + "loss": 1.3689, + "grad_norm": 1.5186249017715454, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.583345664, + "gpu_mem": 4.722429952, + "loss": 1.3802, + "grad_norm": 1.8120955228805542, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.583542272, + "gpu_mem": 4.722417664, + "loss": 1.4023, + "grad_norm": 1.3597509860992432, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.58373888, + "gpu_mem": 4.72244224, + "loss": 1.4003, + "grad_norm": 2.536520004272461, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.583935488, + "gpu_mem": 4.7224576, + "loss": 1.399, + "grad_norm": 1.984102725982666, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.584132096, + "gpu_mem": 4.722429952, + "loss": 1.3774, + "grad_norm": 1.5506010055541992, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.584328704, + "gpu_mem": 4.72242688, + "loss": 1.5196, + "grad_norm": 3.529722213745117, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.584525312, + "gpu_mem": 4.72244224, + "loss": 1.4672, + "grad_norm": 2.5541861057281494, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.58472192, + "gpu_mem": 4.7224192, + "loss": 1.3888, + "grad_norm": 0.4045758843421936, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.584918528, + "gpu_mem": 4.722420736, + "loss": 1.3845, + "grad_norm": 0.42493075132369995, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.585115136, + "gpu_mem": 4.722462208, + "loss": 1.3775, + "grad_norm": 1.5816408395767212, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.585311744, + "gpu_mem": 4.722433024, + "loss": 1.4661, + "grad_norm": 3.0499348640441895, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.585508352, + "gpu_mem": 4.722433024, + "loss": 1.4908, + "grad_norm": 3.6302878856658936, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.58570496, + "gpu_mem": 4.722429952, + "loss": 1.4231, + "grad_norm": 2.2628281116485596, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.585901568, + "gpu_mem": 4.722429952, + "loss": 1.4017, + "grad_norm": 1.6914762258529663, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.586098176, + "gpu_mem": 4.722420736, + "loss": 1.3845, + "grad_norm": 0.6615778207778931, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.586294784, + "gpu_mem": 4.722456064, + "loss": 1.4592, + "grad_norm": 3.784411668777466, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.586491392, + "gpu_mem": 4.722413056, + "loss": 1.4175, + "grad_norm": 2.486010789871216, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.586688, + "gpu_mem": 4.722440704, + "loss": 1.3839, + "grad_norm": 1.2530264854431152, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.586884608, + "gpu_mem": 4.72244992, + "loss": 1.332, + "grad_norm": 2.7633841037750244, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.587081216, + "gpu_mem": 4.722422272, + "loss": 1.4996, + "grad_norm": 4.313691139221191, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.587277824, + "gpu_mem": 4.722431488, + "loss": 1.7336, + "grad_norm": 6.433745384216309, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.587474432, + "gpu_mem": 4.722433024, + "loss": 1.7266, + "grad_norm": 7.260377883911133, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.58767104, + "gpu_mem": 4.722433024, + "loss": 1.4707, + "grad_norm": 3.5277178287506104, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.587867648, + "gpu_mem": 4.722417664, + "loss": 1.4413, + "grad_norm": 2.6769254207611084, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.588064256, + "gpu_mem": 4.722439168, + "loss": 1.4947, + "grad_norm": 4.20396089553833, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.588260864, + "gpu_mem": 4.72247296, + "loss": 1.3753, + "grad_norm": 1.4305341243743896, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.588457472, + "gpu_mem": 4.72242688, + "loss": 1.3944, + "grad_norm": 1.6689095497131348, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.58865408, + "gpu_mem": 4.722433024, + "loss": 1.4284, + "grad_norm": 2.096540689468384, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.588850688, + "gpu_mem": 4.722448384, + "loss": 1.4017, + "grad_norm": 1.722987413406372, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.589047296, + "gpu_mem": 4.722466816, + "loss": 1.442, + "grad_norm": 2.4494879245758057, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.589047296, + "gpu_mem": 4.722436096, + "loss": 1.4706, + "grad_norm": 3.6240603923797607, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.589243904, + "gpu_mem": 4.722422272, + "loss": 1.4173, + "grad_norm": 2.2243576049804688, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.589440512, + "gpu_mem": 4.722414592, + "loss": 1.3926, + "grad_norm": 0.7710844278335571, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.58963712, + "gpu_mem": 4.722479104, + "loss": 1.3955, + "grad_norm": 1.3209964036941528, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.589833728, + "gpu_mem": 4.722417664, + "loss": 1.3976, + "grad_norm": 1.8462011814117432, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.590030336, + "gpu_mem": 4.722469888, + "loss": 1.4099, + "grad_norm": 1.2638853788375854, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.590226944, + "gpu_mem": 4.722451456, + "loss": 1.4169, + "grad_norm": 1.3429983854293823, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.590423552, + "gpu_mem": 4.72244992, + "loss": 1.3993, + "grad_norm": 1.514525294303894, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.59062016, + "gpu_mem": 4.722454528, + "loss": 1.4047, + "grad_norm": 1.3557519912719727, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.590816768, + "gpu_mem": 4.722429952, + "loss": 1.3301, + "grad_norm": 1.594268560409546, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.591013376, + "gpu_mem": 4.722459136, + "loss": 1.4871, + "grad_norm": 3.107274293899536, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.591209984, + "gpu_mem": 4.722436096, + "loss": 1.3567, + "grad_norm": 1.4325249195098877, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.591406592, + "gpu_mem": 4.722497536, + "loss": 1.5368, + "grad_norm": 3.7011468410491943, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.5916032, + "gpu_mem": 4.722422272, + "loss": 1.5046, + "grad_norm": 3.768364429473877, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.5916032, + "gpu_mem": 4.722433024, + "loss": 1.4396, + "grad_norm": 2.5473203659057617, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.591799808, + "gpu_mem": 4.722431488, + "loss": 1.4107, + "grad_norm": 1.2733670473098755, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.591996416, + "gpu_mem": 4.722428416, + "loss": 1.3965, + "grad_norm": 1.3513084650039673, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.592193024, + "gpu_mem": 4.722459136, + "loss": 1.3853, + "grad_norm": 1.1246074438095093, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.592389632, + "gpu_mem": 4.722437632, + "loss": 1.3998, + "grad_norm": 1.7422782182693481, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.59258624, + "gpu_mem": 4.722433024, + "loss": 1.5164, + "grad_norm": 5.017183303833008, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.592782848, + "gpu_mem": 4.722443776, + "loss": 1.372, + "grad_norm": 2.323369026184082, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.592979456, + "gpu_mem": 4.722448384, + "loss": 1.4363, + "grad_norm": 2.778162717819214, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.592979456, + "gpu_mem": 4.722409984, + "loss": 1.4059, + "grad_norm": 1.4805610179901123, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.593176064, + "gpu_mem": 4.722477568, + "loss": 1.4019, + "grad_norm": 1.2946662902832031, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.593372672, + "gpu_mem": 4.722440704, + "loss": 1.3976, + "grad_norm": 1.8461631536483765, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.59356928, + "gpu_mem": 4.722429952, + "loss": 1.4169, + "grad_norm": 1.4749032258987427, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.593765888, + "gpu_mem": 4.722446848, + "loss": 1.3942, + "grad_norm": 1.1028556823730469, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.593765888, + "gpu_mem": 4.722420736, + "loss": 1.3927, + "grad_norm": 1.5785672664642334, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.593962496, + "gpu_mem": 4.722468352, + "loss": 1.4076, + "grad_norm": 1.1293792724609375, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.594159104, + "gpu_mem": 4.722436096, + "loss": 1.4653, + "grad_norm": 2.4549241065979004, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.594355712, + "gpu_mem": 4.722425344, + "loss": 1.4171, + "grad_norm": 1.6131293773651123, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.59455232, + "gpu_mem": 4.722440704, + "loss": 1.3817, + "grad_norm": 0.21497821807861328, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.59455232, + "gpu_mem": 4.722437632, + "loss": 1.3855, + "grad_norm": 0.5771036148071289, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.594748928, + "gpu_mem": 4.722437632, + "loss": 1.3866, + "grad_norm": 1.4057711362838745, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.594945536, + "gpu_mem": 4.722425344, + "loss": 1.3901, + "grad_norm": 1.0046086311340332, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.595142144, + "gpu_mem": 4.722408448, + "loss": 1.3694, + "grad_norm": 0.4486810863018036, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.595338752, + "gpu_mem": 4.722471424, + "loss": 1.4073, + "grad_norm": 1.5608114004135132, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.595338752, + "gpu_mem": 4.722425344, + "loss": 1.3879, + "grad_norm": 1.0217952728271484, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.59553536, + "gpu_mem": 4.72243456, + "loss": 1.4221, + "grad_norm": 2.1287567615509033, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.59553536, + "gpu_mem": 4.722469888, + "loss": 1.4132, + "grad_norm": 1.4676884412765503, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.595731968, + "gpu_mem": 4.72243456, + "loss": 1.3809, + "grad_norm": 0.8270678520202637, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.595928576, + "gpu_mem": 4.722439168, + "loss": 1.448, + "grad_norm": 1.984511375427246, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.596125184, + "gpu_mem": 4.722486784, + "loss": 1.4172, + "grad_norm": 1.4028806686401367, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.596125184, + "gpu_mem": 4.722496, + "loss": 1.4287, + "grad_norm": 1.5114654302597046, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.596321792, + "gpu_mem": 4.72244992, + "loss": 1.4112, + "grad_norm": 1.130182147026062, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.5965184, + "gpu_mem": 4.722443776, + "loss": 1.3752, + "grad_norm": 1.4197263717651367, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.5965184, + "gpu_mem": 4.722505216, + "loss": 1.3921, + "grad_norm": 1.4581854343414307, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.596715008, + "gpu_mem": 4.722431488, + "loss": 1.4748, + "grad_norm": 3.1804046630859375, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.596911616, + "gpu_mem": 4.722429952, + "loss": 1.4828, + "grad_norm": 3.5991506576538086, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.597108224, + "gpu_mem": 4.722433024, + "loss": 1.418, + "grad_norm": 2.02384877204895, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.597304832, + "gpu_mem": 4.7224192, + "loss": 1.4104, + "grad_norm": 2.6172730922698975, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.597304832, + "gpu_mem": 4.72243456, + "loss": 1.3948, + "grad_norm": 1.6382040977478027, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.59750144, + "gpu_mem": 4.72247296, + "loss": 1.3915, + "grad_norm": 1.1048332452774048, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.59750144, + "gpu_mem": 4.722452992, + "loss": 1.4155, + "grad_norm": 2.5705323219299316, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.597698048, + "gpu_mem": 4.722479104, + "loss": 1.4488, + "grad_norm": 3.5037522315979004, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.597894656, + "gpu_mem": 4.722429952, + "loss": 1.4288, + "grad_norm": 2.4826934337615967, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.598091264, + "gpu_mem": 4.722423808, + "loss": 1.4026, + "grad_norm": 1.7591763734817505, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.598091264, + "gpu_mem": 4.722446848, + "loss": 1.4038, + "grad_norm": 1.6296037435531616, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.598287872, + "gpu_mem": 4.722425344, + "loss": 1.3761, + "grad_norm": 0.8546644449234009, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.598287872, + "gpu_mem": 4.722439168, + "loss": 1.4066, + "grad_norm": 1.2767428159713745, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.59848448, + "gpu_mem": 4.722443776, + "loss": 1.4454, + "grad_norm": 2.251019239425659, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.598681088, + "gpu_mem": 4.722462208, + "loss": 1.4672, + "grad_norm": 2.2998292446136475, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.598681088, + "gpu_mem": 4.722433024, + "loss": 1.4021, + "grad_norm": 1.1300181150436401, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.598877696, + "gpu_mem": 4.722460672, + "loss": 1.3976, + "grad_norm": 0.776098906993866, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.599074304, + "gpu_mem": 4.72244224, + "loss": 1.378, + "grad_norm": 0.9076316356658936, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.599074304, + "gpu_mem": 4.722429952, + "loss": 1.4253, + "grad_norm": 1.6103192567825317, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.599074304, + "gpu_mem": 4.722439168, + "loss": 1.4313, + "grad_norm": 1.7175077199935913, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.599270912, + "gpu_mem": 4.722436096, + "loss": 1.408, + "grad_norm": 1.290252447128296, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.59946752, + "gpu_mem": 4.722451456, + "loss": 1.4114, + "grad_norm": 1.1790993213653564, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.599664128, + "gpu_mem": 4.722459136, + "loss": 1.3815, + "grad_norm": 0.17870669066905975, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.599664128, + "gpu_mem": 4.722448384, + "loss": 1.389, + "grad_norm": 0.8957750201225281, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.599664128, + "gpu_mem": 4.722433024, + "loss": 1.4377, + "grad_norm": 2.0576515197753906, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.599860736, + "gpu_mem": 4.722436096, + "loss": 1.4297, + "grad_norm": 1.655872106552124, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.600057344, + "gpu_mem": 4.722429952, + "loss": 1.3864, + "grad_norm": 0.9234837293624878, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.600253952, + "gpu_mem": 4.722425344, + "loss": 1.4469, + "grad_norm": 1.869870901107788, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.600253952, + "gpu_mem": 4.722446848, + "loss": 1.4054, + "grad_norm": 0.7750421762466431, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.60045056, + "gpu_mem": 4.722439168, + "loss": 1.3647, + "grad_norm": 0.9150461554527283, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.60045056, + "gpu_mem": 4.72241152, + "loss": 1.3839, + "grad_norm": 0.5319207310676575, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.600647168, + "gpu_mem": 4.722409984, + "loss": 1.3826, + "grad_norm": 0.5328527092933655, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.600647168, + "gpu_mem": 4.722436096, + "loss": 1.4554, + "grad_norm": 3.1824607849121094, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.600843776, + "gpu_mem": 4.7224192, + "loss": 1.3837, + "grad_norm": 0.5900561213493347, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.600843776, + "gpu_mem": 4.72244992, + "loss": 1.3618, + "grad_norm": 3.622708559036255, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.601040384, + "gpu_mem": 4.722433024, + "loss": 1.4049, + "grad_norm": 2.096372127532959, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.601040384, + "gpu_mem": 4.722463744, + "loss": 1.3919, + "grad_norm": 1.7261409759521484, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.601040384, + "gpu_mem": 4.722431488, + "loss": 1.3919, + "grad_norm": 1.043400526046753, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.601236992, + "gpu_mem": 4.7224576, + "loss": 1.4156, + "grad_norm": 2.4391214847564697, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.6014336, + "gpu_mem": 4.722433024, + "loss": 1.3944, + "grad_norm": 1.2637019157409668, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.6014336, + "gpu_mem": 4.722428416, + "loss": 1.4061, + "grad_norm": 1.5955090522766113, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.601630208, + "gpu_mem": 4.722431488, + "loss": 1.3767, + "grad_norm": 1.2379603385925293, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.601630208, + "gpu_mem": 4.72244992, + "loss": 1.3911, + "grad_norm": 0.9667835235595703, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.601630208, + "gpu_mem": 4.722429952, + "loss": 1.386, + "grad_norm": 1.5125170946121216, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.601826816, + "gpu_mem": 4.72243456, + "loss": 1.3834, + "grad_norm": 0.46077975630760193, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.601826816, + "gpu_mem": 4.722429952, + "loss": 1.399, + "grad_norm": 1.540825605392456, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.602023424, + "gpu_mem": 4.722437632, + "loss": 1.3868, + "grad_norm": 0.3748496174812317, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.602023424, + "gpu_mem": 4.722462208, + "loss": 1.3726, + "grad_norm": 0.6428777575492859, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.602220032, + "gpu_mem": 4.722454528, + "loss": 1.3938, + "grad_norm": 0.9057937860488892, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.60241664, + "gpu_mem": 4.722456064, + "loss": 1.3646, + "grad_norm": 0.6638514995574951, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.60241664, + "gpu_mem": 4.722431488, + "loss": 1.3697, + "grad_norm": 0.9693477153778076, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.60241664, + "gpu_mem": 4.722433024, + "loss": 1.4307, + "grad_norm": 2.396036148071289, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.602613248, + "gpu_mem": 4.722452992, + "loss": 1.3777, + "grad_norm": 0.9697574973106384, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.602809856, + "gpu_mem": 4.722425344, + "loss": 1.4387, + "grad_norm": 2.2656309604644775, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.602809856, + "gpu_mem": 4.722437632, + "loss": 1.357, + "grad_norm": 1.0590201616287231, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.603006464, + "gpu_mem": 4.722446848, + "loss": 1.4211, + "grad_norm": 1.464443564414978, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.603203072, + "gpu_mem": 4.722423808, + "loss": 1.3571, + "grad_norm": 1.055216908454895, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.603203072, + "gpu_mem": 4.722448384, + "loss": 1.4144, + "grad_norm": 1.4881309270858765, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.603203072, + "gpu_mem": 4.722448384, + "loss": 1.407, + "grad_norm": 0.8018154501914978, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.60339968, + "gpu_mem": 4.722431488, + "loss": 1.4056, + "grad_norm": 0.7236229181289673, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.60339968, + "gpu_mem": 4.722428416, + "loss": 1.3258, + "grad_norm": 1.0320041179656982, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.60339968, + "gpu_mem": 4.722420736, + "loss": 1.4321, + "grad_norm": 1.2596511840820312, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.60339968, + "gpu_mem": 4.722459136, + "loss": 1.4996, + "grad_norm": 2.07421612739563, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.603596288, + "gpu_mem": 4.722436096, + "loss": 1.4493, + "grad_norm": 1.445887565612793, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.603596288, + "gpu_mem": 4.72243456, + "loss": 1.4298, + "grad_norm": 1.1198362112045288, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.603596288, + "gpu_mem": 4.722451456, + "loss": 1.4087, + "grad_norm": 1.0315138101577759, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.603792896, + "gpu_mem": 4.722436096, + "loss": 1.4021, + "grad_norm": 0.9127596020698547, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.603989504, + "gpu_mem": 4.722448384, + "loss": 1.3744, + "grad_norm": 0.8786847591400146, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.603989504, + "gpu_mem": 4.722460672, + "loss": 1.3716, + "grad_norm": 0.6664982438087463, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.604186112, + "gpu_mem": 4.722436096, + "loss": 1.415, + "grad_norm": 1.4941357374191284, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.604186112, + "gpu_mem": 4.72248064, + "loss": 1.3491, + "grad_norm": 1.428702712059021, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.604186112, + "gpu_mem": 4.722454528, + "loss": 1.4091, + "grad_norm": 1.6814671754837036, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.60438272, + "gpu_mem": 4.722451456, + "loss": 1.3801, + "grad_norm": 1.1495227813720703, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.60438272, + "gpu_mem": 4.722433024, + "loss": 1.398, + "grad_norm": 1.2410920858383179, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.604579328, + "gpu_mem": 4.722439168, + "loss": 1.4164, + "grad_norm": 1.9049948453903198, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.604579328, + "gpu_mem": 4.722408448, + "loss": 1.3892, + "grad_norm": 1.3961580991744995, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.604579328, + "gpu_mem": 4.72247296, + "loss": 1.4024, + "grad_norm": 1.6122605800628662, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.604775936, + "gpu_mem": 4.72242688, + "loss": 1.3728, + "grad_norm": 1.0319862365722656, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.604775936, + "gpu_mem": 4.722420736, + "loss": 1.3907, + "grad_norm": 0.9848273992538452, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.604775936, + "gpu_mem": 4.722476032, + "loss": 1.407, + "grad_norm": 1.3855117559432983, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.604972544, + "gpu_mem": 4.72244224, + "loss": 1.3706, + "grad_norm": 0.8098156452178955, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.605169152, + "gpu_mem": 4.722429952, + "loss": 1.3788, + "grad_norm": 1.4372848272323608, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.605169152, + "gpu_mem": 4.72243456, + "loss": 1.4082, + "grad_norm": 1.3669991493225098, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.605169152, + "gpu_mem": 4.722414592, + "loss": 1.4029, + "grad_norm": 1.1052916049957275, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.60536576, + "gpu_mem": 4.722439168, + "loss": 1.3812, + "grad_norm": 0.9743290543556213, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.60536576, + "gpu_mem": 4.722417664, + "loss": 1.385, + "grad_norm": 0.9301744699478149, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.60536576, + "gpu_mem": 4.72243456, + "loss": 1.3709, + "grad_norm": 1.010877251625061, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.60536576, + "gpu_mem": 4.722399232, + "loss": 1.3701, + "grad_norm": 1.4049146175384521, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.605562368, + "gpu_mem": 4.722431488, + "loss": 1.3731, + "grad_norm": 2.057569742202759, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.605562368, + "gpu_mem": 4.722420736, + "loss": 1.3595, + "grad_norm": 2.2469840049743652, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.605562368, + "gpu_mem": 4.7224576, + "loss": 1.342, + "grad_norm": 3.2228143215179443, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.605758976, + "gpu_mem": 4.722423808, + "loss": 1.3504, + "grad_norm": 3.717660903930664, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.605758976, + "gpu_mem": 4.722446848, + "loss": 1.3569, + "grad_norm": 5.9503326416015625, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.605955584, + "gpu_mem": 4.722436096, + "loss": 1.3444, + "grad_norm": 5.348925590515137, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.605955584, + "gpu_mem": 4.72244224, + "loss": 1.3828, + "grad_norm": 5.41358757019043, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.605955584, + "gpu_mem": 4.722436096, + "loss": 1.4097, + "grad_norm": 5.044307708740234, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.605955584, + "gpu_mem": 4.722454528, + "loss": 1.3926, + "grad_norm": 6.6494669914245605, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.605955584, + "gpu_mem": 4.722414592, + "loss": 1.3947, + "grad_norm": 2.565629482269287, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.606152192, + "gpu_mem": 4.722446848, + "loss": 1.378, + "grad_norm": 2.2482383251190186, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.6063488, + "gpu_mem": 4.722466816, + "loss": 1.4119, + "grad_norm": 4.856621742248535, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.6063488, + "gpu_mem": 4.722460672, + "loss": 1.4346, + "grad_norm": 3.6042072772979736, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.6063488, + "gpu_mem": 4.722423808, + "loss": 1.3251, + "grad_norm": 3.4491512775421143, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.6063488, + "gpu_mem": 4.722440704, + "loss": 1.366, + "grad_norm": 3.8757376670837402, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.606545408, + "gpu_mem": 4.722417664, + "loss": 1.3843, + "grad_norm": 4.232378959655762, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.606545408, + "gpu_mem": 4.72244992, + "loss": 1.3569, + "grad_norm": 3.3628809452056885, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.606545408, + "gpu_mem": 4.722445312, + "loss": 1.3444, + "grad_norm": 4.446598052978516, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.606545408, + "gpu_mem": 4.722454528, + "loss": 1.4087, + "grad_norm": 4.462116718292236, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.606742016, + "gpu_mem": 4.722428416, + "loss": 1.3905, + "grad_norm": 3.9077913761138916, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.606742016, + "gpu_mem": 4.722448384, + "loss": 1.3974, + "grad_norm": 1.6813745498657227, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.606742016, + "gpu_mem": 4.722422272, + "loss": 1.3856, + "grad_norm": 0.24823980033397675, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.606938624, + "gpu_mem": 4.722446848, + "loss": 1.3856, + "grad_norm": 0.9861867427825928, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.606938624, + "gpu_mem": 4.722431488, + "loss": 1.4092, + "grad_norm": 2.0093586444854736, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.606938624, + "gpu_mem": 4.72246528, + "loss": 1.356, + "grad_norm": 1.206144094467163, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.607135232, + "gpu_mem": 4.722445312, + "loss": 1.3764, + "grad_norm": 1.0379422903060913, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.607135232, + "gpu_mem": 4.722429952, + "loss": 1.4146, + "grad_norm": 2.009974479675293, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.607135232, + "gpu_mem": 4.72246528, + "loss": 1.4065, + "grad_norm": 1.3824514150619507, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.607135232, + "gpu_mem": 4.722471424, + "loss": 1.4402, + "grad_norm": 2.700124502182007, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.60733184, + "gpu_mem": 4.72243456, + "loss": 1.3896, + "grad_norm": 1.5500456094741821, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.60733184, + "gpu_mem": 4.722413056, + "loss": 1.3968, + "grad_norm": 1.6770946979522705, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.60733184, + "gpu_mem": 4.72246528, + "loss": 1.3706, + "grad_norm": 1.7144904136657715, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.60733184, + "gpu_mem": 4.722451456, + "loss": 1.4344, + "grad_norm": 2.8785345554351807, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.60733184, + "gpu_mem": 4.722445312, + "loss": 1.4021, + "grad_norm": 1.9984567165374756, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.607528448, + "gpu_mem": 4.722451456, + "loss": 1.3902, + "grad_norm": 1.672563076019287, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.607528448, + "gpu_mem": 4.722428416, + "loss": 1.4232, + "grad_norm": 1.9962295293807983, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.607528448, + "gpu_mem": 4.72244224, + "loss": 1.4133, + "grad_norm": 1.5227537155151367, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.607725056, + "gpu_mem": 4.72244224, + "loss": 1.3797, + "grad_norm": 0.8671183586120605, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.607725056, + "gpu_mem": 4.72241152, + "loss": 1.4201, + "grad_norm": 1.9777090549468994, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.607725056, + "gpu_mem": 4.722445312, + "loss": 1.4026, + "grad_norm": 1.035771369934082, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.607725056, + "gpu_mem": 4.722423808, + "loss": 1.3868, + "grad_norm": 0.37821176648139954, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.607725056, + "gpu_mem": 4.722431488, + "loss": 1.3914, + "grad_norm": 0.4145619869232178, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.607921664, + "gpu_mem": 4.72244992, + "loss": 1.3928, + "grad_norm": 0.42201513051986694, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.607921664, + "gpu_mem": 4.722417664, + "loss": 1.3956, + "grad_norm": 0.5902168154716492, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.607921664, + "gpu_mem": 4.722422272, + "loss": 1.3864, + "grad_norm": 0.3691595196723938, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.608118272, + "gpu_mem": 4.722417664, + "loss": 1.3909, + "grad_norm": 0.6151823997497559, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.608118272, + "gpu_mem": 4.722462208, + "loss": 1.3772, + "grad_norm": 0.6575400233268738, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.608118272, + "gpu_mem": 4.722445312, + "loss": 1.3937, + "grad_norm": 1.4033901691436768, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.608118272, + "gpu_mem": 4.72243456, + "loss": 1.3875, + "grad_norm": 0.494539350271225, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.608118272, + "gpu_mem": 4.722456064, + "loss": 1.3827, + "grad_norm": 1.529107689857483, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.60831488, + "gpu_mem": 4.722422272, + "loss": 1.3845, + "grad_norm": 0.3872825801372528, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.60831488, + "gpu_mem": 4.722437632, + "loss": 1.405, + "grad_norm": 1.366998553276062, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.60831488, + "gpu_mem": 4.722437632, + "loss": 1.4384, + "grad_norm": 1.4649250507354736, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.608511488, + "gpu_mem": 4.722428416, + "loss": 1.4102, + "grad_norm": 1.0243183374404907, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.608511488, + "gpu_mem": 4.722439168, + "loss": 1.4357, + "grad_norm": 1.9205125570297241, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.608511488, + "gpu_mem": 4.722463744, + "loss": 1.378, + "grad_norm": 0.6563411355018616, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.608511488, + "gpu_mem": 4.722416128, + "loss": 1.3712, + "grad_norm": 0.8689450621604919, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.608511488, + "gpu_mem": 4.722451456, + "loss": 1.4103, + "grad_norm": 1.2989752292633057, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.608511488, + "gpu_mem": 4.722413056, + "loss": 1.4124, + "grad_norm": 1.8457000255584717, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.608708096, + "gpu_mem": 4.722431488, + "loss": 1.3765, + "grad_norm": 0.8638191223144531, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.608708096, + "gpu_mem": 4.722423808, + "loss": 1.3977, + "grad_norm": 1.0114805698394775, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.608904704, + "gpu_mem": 4.722460672, + "loss": 1.3859, + "grad_norm": 0.36698541045188904, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.608904704, + "gpu_mem": 4.722420736, + "loss": 1.4121, + "grad_norm": 1.349655032157898, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.608904704, + "gpu_mem": 4.72243456, + "loss": 1.4258, + "grad_norm": 1.595308780670166, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.608904704, + "gpu_mem": 4.722439168, + "loss": 1.3897, + "grad_norm": 0.9533817172050476, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.608904704, + "gpu_mem": 4.722400768, + "loss": 1.3821, + "grad_norm": 1.2481635808944702, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.608904704, + "gpu_mem": 4.722423808, + "loss": 1.3905, + "grad_norm": 0.4243612289428711, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.608904704, + "gpu_mem": 4.722422272, + "loss": 1.3897, + "grad_norm": 0.6699212789535522, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.609101312, + "gpu_mem": 4.722440704, + "loss": 1.4001, + "grad_norm": 0.8736286163330078, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.609101312, + "gpu_mem": 4.722437632, + "loss": 1.3845, + "grad_norm": 0.5848421454429626, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.609101312, + "gpu_mem": 4.722436096, + "loss": 1.3974, + "grad_norm": 0.5167127251625061, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.60929792, + "gpu_mem": 4.722454528, + "loss": 1.3742, + "grad_norm": 0.6368122696876526, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.60929792, + "gpu_mem": 4.722416128, + "loss": 1.3835, + "grad_norm": 0.1171102449297905, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.60929792, + "gpu_mem": 4.722460672, + "loss": 1.4104, + "grad_norm": 1.0352524518966675, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.60929792, + "gpu_mem": 4.722425344, + "loss": 1.3891, + "grad_norm": 0.6311654448509216, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.609494528, + "gpu_mem": 4.722452992, + "loss": 1.4044, + "grad_norm": 0.9704092741012573, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.609494528, + "gpu_mem": 4.722433024, + "loss": 1.3846, + "grad_norm": 0.3373461961746216, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.609494528, + "gpu_mem": 4.722479104, + "loss": 1.3824, + "grad_norm": 1.0861529111862183, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.609494528, + "gpu_mem": 4.722443776, + "loss": 1.385, + "grad_norm": 0.36155375838279724, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.609494528, + "gpu_mem": 4.72243456, + "loss": 1.3963, + "grad_norm": 0.801810622215271, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.609494528, + "gpu_mem": 4.722428416, + "loss": 1.3827, + "grad_norm": 0.3272384703159332, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.609494528, + "gpu_mem": 4.722413056, + "loss": 1.3791, + "grad_norm": 0.35559698939323425, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.609691136, + "gpu_mem": 4.722431488, + "loss": 1.3647, + "grad_norm": 1.2348371744155884, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.609691136, + "gpu_mem": 4.722433024, + "loss": 1.3718, + "grad_norm": 1.0610803365707397, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.609691136, + "gpu_mem": 4.722437632, + "loss": 1.4118, + "grad_norm": 1.1146924495697021, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.609691136, + "gpu_mem": 4.722440704, + "loss": 1.3928, + "grad_norm": 0.5746317505836487, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.609691136, + "gpu_mem": 4.72243456, + "loss": 1.4028, + "grad_norm": 1.1448612213134766, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.609691136, + "gpu_mem": 4.722460672, + "loss": 1.4134, + "grad_norm": 0.9430534839630127, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.609691136, + "gpu_mem": 4.722428416, + "loss": 1.3757, + "grad_norm": 0.9812300205230713, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.609691136, + "gpu_mem": 4.722456064, + "loss": 1.3932, + "grad_norm": 0.8907880187034607, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.609887744, + "gpu_mem": 4.722463744, + "loss": 1.3865, + "grad_norm": 1.136398196220398, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.609887744, + "gpu_mem": 4.722445312, + "loss": 1.3944, + "grad_norm": 0.973312497138977, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.609887744, + "gpu_mem": 4.722431488, + "loss": 1.3985, + "grad_norm": 0.7315878868103027, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.609887744, + "gpu_mem": 4.72244224, + "loss": 1.3978, + "grad_norm": 0.564954936504364, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.609887744, + "gpu_mem": 4.72243456, + "loss": 1.3801, + "grad_norm": 0.5291483402252197, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.609887744, + "gpu_mem": 4.722451456, + "loss": 1.384, + "grad_norm": 0.17033983767032623, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.609887744, + "gpu_mem": 4.722423808, + "loss": 1.3692, + "grad_norm": 0.29265546798706055, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.609887744, + "gpu_mem": 4.722454528, + "loss": 1.4085, + "grad_norm": 0.9132630825042725, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.610084352, + "gpu_mem": 4.722436096, + "loss": 1.3874, + "grad_norm": 0.8261816501617432, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.610084352, + "gpu_mem": 4.722423808, + "loss": 1.3967, + "grad_norm": 0.6913045644760132, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.610084352, + "gpu_mem": 4.722436096, + "loss": 1.4154, + "grad_norm": 1.1446325778961182, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.610084352, + "gpu_mem": 4.72244224, + "loss": 1.4008, + "grad_norm": 0.7479490041732788, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.610084352, + "gpu_mem": 4.722429952, + "loss": 1.3917, + "grad_norm": 0.6727002263069153, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.610084352, + "gpu_mem": 4.7224192, + "loss": 1.3906, + "grad_norm": 0.8892552852630615, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.610084352, + "gpu_mem": 4.722420736, + "loss": 1.378, + "grad_norm": 0.756604790687561, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.61028096, + "gpu_mem": 4.72243456, + "loss": 1.3937, + "grad_norm": 0.5460458993911743, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.61028096, + "gpu_mem": 4.722437632, + "loss": 1.3986, + "grad_norm": 0.8170487880706787, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.61028096, + "gpu_mem": 4.722448384, + "loss": 1.4025, + "grad_norm": 1.5717213153839111, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.61028096, + "gpu_mem": 4.722422272, + "loss": 1.3809, + "grad_norm": 0.3439090847969055, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.61028096, + "gpu_mem": 4.722437632, + "loss": 1.3944, + "grad_norm": 0.9885152578353882, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.61028096, + "gpu_mem": 4.722446848, + "loss": 1.3819, + "grad_norm": 1.081560492515564, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.61028096, + "gpu_mem": 4.722420736, + "loss": 1.4104, + "grad_norm": 1.7075902223587036, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.61028096, + "gpu_mem": 4.72242688, + "loss": 1.3742, + "grad_norm": 0.54472815990448, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.61028096, + "gpu_mem": 4.722416128, + "loss": 1.3655, + "grad_norm": 1.1097460985183716, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.610477568, + "gpu_mem": 4.722422272, + "loss": 1.3996, + "grad_norm": 1.0393513441085815, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.610674176, + "gpu_mem": 4.722459136, + "loss": 1.4086, + "grad_norm": 1.0467528104782104, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.610674176, + "gpu_mem": 4.722406912, + "loss": 1.4099, + "grad_norm": 1.0224844217300415, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.610674176, + "gpu_mem": 4.72242688, + "loss": 1.4019, + "grad_norm": 1.3160039186477661, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.610674176, + "gpu_mem": 4.72242688, + "loss": 1.4102, + "grad_norm": 1.3742938041687012, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.610674176, + "gpu_mem": 4.722425344, + "loss": 1.3884, + "grad_norm": 0.5741173624992371, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.610674176, + "gpu_mem": 4.722423808, + "loss": 1.3877, + "grad_norm": 0.9824524521827698, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.610674176, + "gpu_mem": 4.722416128, + "loss": 1.3942, + "grad_norm": 1.097610592842102, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.610674176, + "gpu_mem": 4.722476032, + "loss": 1.3845, + "grad_norm": 0.33602264523506165, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722420736, + "loss": 1.3907, + "grad_norm": 0.5198936462402344, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.610870784, + "gpu_mem": 4.72240384, + "loss": 1.3923, + "grad_norm": 0.9027726054191589, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.610870784, + "gpu_mem": 4.72243456, + "loss": 1.3784, + "grad_norm": 0.7389135956764221, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722479104, + "loss": 1.3864, + "grad_norm": 0.9099717736244202, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722459136, + "loss": 1.3764, + "grad_norm": 0.5877934098243713, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722459136, + "loss": 1.4026, + "grad_norm": 0.8454053401947021, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722425344, + "loss": 1.4034, + "grad_norm": 1.2035117149353027, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.610870784, + "gpu_mem": 4.72244992, + "loss": 1.4004, + "grad_norm": 0.6351267695426941, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722452992, + "loss": 1.4004, + "grad_norm": 0.6629595756530762, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722431488, + "loss": 1.3947, + "grad_norm": 0.8256568908691406, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722454528, + "loss": 1.3884, + "grad_norm": 0.930081844329834, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.610870784, + "gpu_mem": 4.72243456, + "loss": 1.3931, + "grad_norm": 0.6072859764099121, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722459136, + "loss": 1.388, + "grad_norm": 0.4087527096271515, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.610870784, + "gpu_mem": 4.72244224, + "loss": 1.3909, + "grad_norm": 0.30152732133865356, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722437632, + "loss": 1.3838, + "grad_norm": 0.6275619268417358, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.610870784, + "gpu_mem": 4.72244992, + "loss": 1.3829, + "grad_norm": 0.49203988909721375, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722420736, + "loss": 1.3946, + "grad_norm": 0.6078224778175354, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.610870784, + "gpu_mem": 4.72243456, + "loss": 1.3917, + "grad_norm": 0.5574798583984375, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722420736, + "loss": 1.396, + "grad_norm": 0.7223727703094482, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722414592, + "loss": 1.3864, + "grad_norm": 0.2870572507381439, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.610870784, + "gpu_mem": 4.722420736, + "loss": 1.3881, + "grad_norm": 0.17508858442306519, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.611067392, + "gpu_mem": 4.72243456, + "loss": 1.3855, + "grad_norm": 0.410611629486084, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.611067392, + "gpu_mem": 4.722417664, + "loss": 1.3834, + "grad_norm": 0.8159025311470032, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.611264, + "gpu_mem": 4.722471424, + "loss": 1.3851, + "grad_norm": 0.41964346170425415, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.611264, + "gpu_mem": 4.722414592, + "loss": 1.3877, + "grad_norm": 0.2693106532096863, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.611264, + "gpu_mem": 4.722492928, + "loss": 1.3891, + "grad_norm": 0.8223925232887268, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.611264, + "gpu_mem": 4.722436096, + "loss": 1.3974, + "grad_norm": 1.2207140922546387, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.611264, + "gpu_mem": 4.722454528, + "loss": 1.4083, + "grad_norm": 0.9572092294692993, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.611264, + "gpu_mem": 4.722429952, + "loss": 1.3995, + "grad_norm": 0.8338014483451843, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.611264, + "gpu_mem": 4.722462208, + "loss": 1.3891, + "grad_norm": 0.42752838134765625, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722482176, + "loss": 1.3837, + "grad_norm": 0.5737965703010559, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.611460608, + "gpu_mem": 4.72241152, + "loss": 1.3879, + "grad_norm": 0.45506542921066284, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722425344, + "loss": 1.3786, + "grad_norm": 0.4867141544818878, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722409984, + "loss": 1.3954, + "grad_norm": 0.6283121705055237, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722448384, + "loss": 1.3907, + "grad_norm": 0.3768954575061798, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722448384, + "loss": 1.3899, + "grad_norm": 0.6269468665122986, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.611460608, + "gpu_mem": 4.72243456, + "loss": 1.3926, + "grad_norm": 0.5019913911819458, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722425344, + "loss": 1.3853, + "grad_norm": 0.296915203332901, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722429952, + "loss": 1.3887, + "grad_norm": 0.2889338433742523, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722433024, + "loss": 1.3865, + "grad_norm": 0.49494239687919617, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722439168, + "loss": 1.384, + "grad_norm": 0.2657417058944702, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.611460608, + "gpu_mem": 4.7224576, + "loss": 1.3861, + "grad_norm": 0.22828762233257294, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722451456, + "loss": 1.39, + "grad_norm": 0.5762540102005005, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722428416, + "loss": 1.3847, + "grad_norm": 0.3682617247104645, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722416128, + "loss": 1.3818, + "grad_norm": 0.5307474732398987, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.611460608, + "gpu_mem": 4.722382336, + "loss": 1.3987, + "grad_norm": 1.069140911102295, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722429952, + "loss": 1.3857, + "grad_norm": 0.41975656151771545, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72239616, + "loss": 1.3829, + "grad_norm": 0.3498774766921997, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722443776, + "loss": 1.3887, + "grad_norm": 0.744016706943512, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72244224, + "loss": 1.3868, + "grad_norm": 0.3477950692176819, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722443776, + "loss": 1.3911, + "grad_norm": 0.6162286996841431, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722452992, + "loss": 1.3883, + "grad_norm": 0.3727521300315857, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722428416, + "loss": 1.3858, + "grad_norm": 0.5726221203804016, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722413056, + "loss": 1.3897, + "grad_norm": 0.285888671875, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72244224, + "loss": 1.3915, + "grad_norm": 0.6513480544090271, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722456064, + "loss": 1.3831, + "grad_norm": 0.5252628922462463, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72241152, + "loss": 1.3879, + "grad_norm": 0.8586182594299316, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722417664, + "loss": 1.3859, + "grad_norm": 0.6484368443489075, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722446848, + "loss": 1.3839, + "grad_norm": 0.25634437799453735, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72244224, + "loss": 1.3832, + "grad_norm": 0.47787532210350037, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722428416, + "loss": 1.3944, + "grad_norm": 0.5599976778030396, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72244224, + "loss": 1.3684, + "grad_norm": 0.9128372073173523, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722431488, + "loss": 1.3937, + "grad_norm": 0.7286197543144226, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722437632, + "loss": 1.3867, + "grad_norm": 0.44419610500335693, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72244224, + "loss": 1.3809, + "grad_norm": 0.3898725211620331, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722437632, + "loss": 1.3901, + "grad_norm": 0.6062067151069641, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72241152, + "loss": 1.3777, + "grad_norm": 0.7227404713630676, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722420736, + "loss": 1.3905, + "grad_norm": 0.7412676215171814, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722439168, + "loss": 1.391, + "grad_norm": 0.949923574924469, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722409984, + "loss": 1.3848, + "grad_norm": 0.9975295066833496, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.611657216, + "gpu_mem": 4.722440704, + "loss": 1.3853, + "grad_norm": 0.9624658226966858, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72244992, + "loss": 1.3886, + "grad_norm": 1.0499154329299927, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.611657216, + "gpu_mem": 4.72241152, + "loss": 1.3928, + "grad_norm": 0.9055944085121155, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722416128, + "loss": 1.3785, + "grad_norm": 1.0142476558685303, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722440704, + "loss": 1.3779, + "grad_norm": 1.0462636947631836, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722459136, + "loss": 1.3685, + "grad_norm": 1.1422914266586304, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722440704, + "loss": 1.3674, + "grad_norm": 1.4782180786132812, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722491392, + "loss": 1.3707, + "grad_norm": 1.486499547958374, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722423808, + "loss": 1.3739, + "grad_norm": 1.2915288209915161, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722425344, + "loss": 1.3725, + "grad_norm": 1.6012204885482788, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.611853824, + "gpu_mem": 4.722425344, + "loss": 1.3836, + "grad_norm": 1.4433904886245728, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722431488, + "loss": 1.3703, + "grad_norm": 1.3450508117675781, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722445312, + "loss": 1.3622, + "grad_norm": 1.7130385637283325, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72244992, + "loss": 1.3687, + "grad_norm": 1.579100489616394, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722443776, + "loss": 1.3584, + "grad_norm": 1.8224035501480103, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722437632, + "loss": 1.3537, + "grad_norm": 1.871652364730835, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722451456, + "loss": 1.3876, + "grad_norm": 2.049762487411499, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722443776, + "loss": 1.3629, + "grad_norm": 1.5026346445083618, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722428416, + "loss": 1.371, + "grad_norm": 1.888851523399353, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722437632, + "loss": 1.3552, + "grad_norm": 1.8216677904129028, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722446848, + "loss": 1.3708, + "grad_norm": 1.6709445714950562, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72244992, + "loss": 1.3473, + "grad_norm": 1.775826334953308, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72240384, + "loss": 1.3483, + "grad_norm": 1.7349151372909546, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.612050432, + "gpu_mem": 4.7224576, + "loss": 1.3538, + "grad_norm": 1.7192010879516602, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722459136, + "loss": 1.36, + "grad_norm": 1.9582927227020264, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72240384, + "loss": 1.3518, + "grad_norm": 1.9528534412384033, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722437632, + "loss": 1.3371, + "grad_norm": 1.811846375465393, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722416128, + "loss": 1.3584, + "grad_norm": 2.013731002807617, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722446848, + "loss": 1.3613, + "grad_norm": 2.1947970390319824, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722422272, + "loss": 1.3613, + "grad_norm": 2.0811009407043457, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722456064, + "loss": 1.3686, + "grad_norm": 1.7894784212112427, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722476032, + "loss": 1.3834, + "grad_norm": 2.714114189147949, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722440704, + "loss": 1.3612, + "grad_norm": 2.472625732421875, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722460672, + "loss": 1.3486, + "grad_norm": 1.9119988679885864, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722440704, + "loss": 1.3585, + "grad_norm": 2.7448368072509766, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72244224, + "loss": 1.3382, + "grad_norm": 2.2698476314544678, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72243456, + "loss": 1.365, + "grad_norm": 2.582028865814209, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722433024, + "loss": 1.348, + "grad_norm": 2.3883917331695557, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722446848, + "loss": 1.3774, + "grad_norm": 2.3598060607910156, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722417664, + "loss": 1.3775, + "grad_norm": 2.397908926010132, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722468352, + "loss": 1.4001, + "grad_norm": 2.2537097930908203, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72246528, + "loss": 1.3654, + "grad_norm": 1.9344204664230347, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722445312, + "loss": 1.3411, + "grad_norm": 2.640138864517212, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72242688, + "loss": 1.3311, + "grad_norm": 2.7026572227478027, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722436096, + "loss": 1.3844, + "grad_norm": 2.5128817558288574, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72240384, + "loss": 1.3618, + "grad_norm": 2.6440250873565674, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72246528, + "loss": 1.3757, + "grad_norm": 2.665492057800293, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722463744, + "loss": 1.365, + "grad_norm": 2.355131149291992, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.612050432, + "gpu_mem": 4.7224192, + "loss": 1.359, + "grad_norm": 2.486896514892578, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722451456, + "loss": 1.3555, + "grad_norm": 2.5654330253601074, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722445312, + "loss": 1.3599, + "grad_norm": 2.109545946121216, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722431488, + "loss": 1.4069, + "grad_norm": 2.5778605937957764, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722431488, + "loss": 1.3521, + "grad_norm": 2.6051065921783447, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.612050432, + "gpu_mem": 4.7224576, + "loss": 1.3628, + "grad_norm": 2.4459681510925293, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722445312, + "loss": 1.3538, + "grad_norm": 2.714845895767212, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722436096, + "loss": 1.3544, + "grad_norm": 2.5137243270874023, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722428416, + "loss": 1.3796, + "grad_norm": 2.0661675930023193, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722451456, + "loss": 1.3633, + "grad_norm": 2.9593029022216797, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722440704, + "loss": 1.3574, + "grad_norm": 2.91758394241333, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722425344, + "loss": 1.3766, + "grad_norm": 2.43530011177063, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722491392, + "loss": 1.3356, + "grad_norm": 2.75797700881958, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72243456, + "loss": 1.39, + "grad_norm": 2.256016731262207, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722420736, + "loss": 1.3683, + "grad_norm": 2.3038384914398193, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722485248, + "loss": 1.3866, + "grad_norm": 3.3814566135406494, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722414592, + "loss": 1.3525, + "grad_norm": 2.532308578491211, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.612050432, + "gpu_mem": 4.72244224, + "loss": 1.3491, + "grad_norm": 1.9540045261383057, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722445312, + "loss": 1.3672, + "grad_norm": 2.476719617843628, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722420736, + "loss": 1.3892, + "grad_norm": 2.8668808937072754, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.612050432, + "gpu_mem": 4.722448384, + "loss": 1.343, + "grad_norm": 2.3412957191467285, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.61224704, + "gpu_mem": 4.7224576, + "loss": 1.3519, + "grad_norm": 2.453914165496826, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.61224704, + "gpu_mem": 4.722454528, + "loss": 1.3647, + "grad_norm": 2.5277469158172607, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.61224704, + "gpu_mem": 4.72242688, + "loss": 1.3736, + "grad_norm": 2.3556041717529297, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.61224704, + "gpu_mem": 4.72217344, + "loss": 1.3102, + "grad_norm": 3.2788288593292236, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.61224704, + "gpu_mem": 4.72217344, + "train_runtime": 8107.2833, + "train_samples_per_second": 4.922, + "train_steps_per_second": 0.077, + "total_flos": 0.0, + "train_loss": 1.4366886867926671 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6934cfad94edb068f0d54db83e6a8b58f0fc939 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ee3780fdcf1bf03bfa213dc48659233894a2cedc --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.2504481179047998 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..bd224c9c3277d9d10f3aa7709ec7c3fab19074f7 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6317696 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-hellaswag-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T23:40:44.452046" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..70b272950775fc017a206e4df0a8174695dea184 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.490477056, + "gpu_mem": 4.443068928, + "loss": 3.4877, + "grad_norm": 197.58782958984375, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.49696512, + "gpu_mem": 4.493682688, + "loss": 3.6203, + "grad_norm": 197.1517791748047, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.49794816, + "gpu_mem": 4.493690368, + "loss": 2.9312, + "grad_norm": 161.58560180664062, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.4989312, + "gpu_mem": 4.49372416, + "loss": 2.4845, + "grad_norm": 91.73587799072266, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.49991424, + "gpu_mem": 4.493687296, + "loss": 1.8525, + "grad_norm": 52.71778869628906, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.50089728, + "gpu_mem": 4.493733376, + "loss": 1.5453, + "grad_norm": 24.816925048828125, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.50188032, + "gpu_mem": 4.49369344, + "loss": 1.3944, + "grad_norm": 18.616849899291992, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.502666752, + "gpu_mem": 4.49372416, + "loss": 1.6391, + "grad_norm": 49.84831237792969, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.503649792, + "gpu_mem": 4.49372416, + "loss": 1.5171, + "grad_norm": 28.693214416503906, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.504436224, + "gpu_mem": 4.493667328, + "loss": 1.51, + "grad_norm": 14.804232597351074, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.505026048, + "gpu_mem": 4.493687296, + "loss": 1.4653, + "grad_norm": 13.072846412658691, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.50581248, + "gpu_mem": 4.493684224, + "loss": 1.5007, + "grad_norm": 19.242982864379883, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.506402304, + "gpu_mem": 4.493676544, + "loss": 1.3914, + "grad_norm": 6.483973026275635, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.507188736, + "gpu_mem": 4.493702656, + "loss": 1.4362, + "grad_norm": 17.707304000854492, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.507975168, + "gpu_mem": 4.49370112, + "loss": 1.4809, + "grad_norm": 20.324914932250977, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.5087616, + "gpu_mem": 4.49369344, + "loss": 1.3772, + "grad_norm": 4.270448207855225, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.509351424, + "gpu_mem": 4.49369344, + "loss": 1.4619, + "grad_norm": 9.25676155090332, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.510137856, + "gpu_mem": 4.49369344, + "loss": 1.357, + "grad_norm": 6.73371696472168, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.51072768, + "gpu_mem": 4.49369344, + "loss": 1.462, + "grad_norm": 7.444686412811279, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.511514112, + "gpu_mem": 4.493667328, + "loss": 1.4961, + "grad_norm": 10.376049041748047, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.512103936, + "gpu_mem": 4.493684224, + "loss": 1.4243, + "grad_norm": 6.460087299346924, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.512890368, + "gpu_mem": 4.493691904, + "loss": 1.4484, + "grad_norm": 7.836177825927734, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.513480192, + "gpu_mem": 4.493705728, + "loss": 1.3813, + "grad_norm": 3.215859889984131, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.514070016, + "gpu_mem": 4.493690368, + "loss": 1.3993, + "grad_norm": 6.221399784088135, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.51465984, + "gpu_mem": 4.49367808, + "loss": 1.4951, + "grad_norm": 10.708608627319336, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.515249664, + "gpu_mem": 4.493684224, + "loss": 1.393, + "grad_norm": 5.559924602508545, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.515839488, + "gpu_mem": 4.493691904, + "loss": 1.4489, + "grad_norm": 10.240148544311523, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.516429312, + "gpu_mem": 4.493687296, + "loss": 1.3712, + "grad_norm": 2.7824208736419678, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.517019136, + "gpu_mem": 4.493696512, + "loss": 1.4622, + "grad_norm": 7.267579078674316, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.51760896, + "gpu_mem": 4.493668864, + "loss": 1.4239, + "grad_norm": 6.568663597106934, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.518198784, + "gpu_mem": 4.49372416, + "loss": 1.3946, + "grad_norm": 2.786107063293457, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.518788608, + "gpu_mem": 4.49371648, + "loss": 1.391, + "grad_norm": 2.4760096073150635, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.519378432, + "gpu_mem": 4.4936704, + "loss": 1.4038, + "grad_norm": 3.5292749404907227, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.519968256, + "gpu_mem": 4.493688832, + "loss": 1.3928, + "grad_norm": 2.1710269451141357, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.52055808, + "gpu_mem": 4.493710336, + "loss": 1.4333, + "grad_norm": 7.068583011627197, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.521147904, + "gpu_mem": 4.4937088, + "loss": 1.3827, + "grad_norm": 1.9546290636062622, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.52154112, + "gpu_mem": 4.493741056, + "loss": 1.4203, + "grad_norm": 4.018156051635742, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.522130944, + "gpu_mem": 4.49369344, + "loss": 1.4079, + "grad_norm": 2.447579860687256, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.522720768, + "gpu_mem": 4.493750272, + "loss": 1.3816, + "grad_norm": 5.477646350860596, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.523310592, + "gpu_mem": 4.49367808, + "loss": 1.3896, + "grad_norm": 2.5201098918914795, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.523900416, + "gpu_mem": 4.493705728, + "loss": 1.4047, + "grad_norm": 4.747108459472656, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.524293632, + "gpu_mem": 4.493719552, + "loss": 1.4888, + "grad_norm": 6.449206352233887, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.524883456, + "gpu_mem": 4.493725696, + "loss": 1.3844, + "grad_norm": 2.276243209838867, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.52547328, + "gpu_mem": 4.493704192, + "loss": 1.3976, + "grad_norm": 1.6573036909103394, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.525866496, + "gpu_mem": 4.493704192, + "loss": 1.4198, + "grad_norm": 3.31832218170166, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.526259712, + "gpu_mem": 4.493704192, + "loss": 1.3434, + "grad_norm": 2.0278918743133545, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.526849536, + "gpu_mem": 4.493690368, + "loss": 1.435, + "grad_norm": 4.17558479309082, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.52743936, + "gpu_mem": 4.4937088, + "loss": 1.3653, + "grad_norm": 2.224799394607544, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.528029184, + "gpu_mem": 4.493721088, + "loss": 1.4487, + "grad_norm": 5.568036079406738, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.528619008, + "gpu_mem": 4.493698048, + "loss": 1.396, + "grad_norm": 4.042112827301025, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.529012224, + "gpu_mem": 4.493682688, + "loss": 1.5374, + "grad_norm": 17.654705047607422, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.529602048, + "gpu_mem": 4.493687296, + "loss": 1.374, + "grad_norm": 4.561004161834717, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.529995264, + "gpu_mem": 4.493714944, + "loss": 1.4217, + "grad_norm": 7.734982490539551, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.53038848, + "gpu_mem": 4.493690368, + "loss": 1.5671, + "grad_norm": 22.13733673095703, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.530978304, + "gpu_mem": 4.4937088, + "loss": 1.418, + "grad_norm": 7.439602375030518, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.53137152, + "gpu_mem": 4.493702656, + "loss": 1.412, + "grad_norm": 6.829029083251953, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.531764736, + "gpu_mem": 4.493668864, + "loss": 1.3833, + "grad_norm": 6.900156021118164, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.53235456, + "gpu_mem": 4.493698048, + "loss": 1.432, + "grad_norm": 9.24257755279541, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.532747776, + "gpu_mem": 4.493681152, + "loss": 1.3852, + "grad_norm": 8.134661674499512, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.533140992, + "gpu_mem": 4.493722624, + "loss": 1.4738, + "grad_norm": 10.107804298400879, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.533730816, + "gpu_mem": 4.493688832, + "loss": 1.4322, + "grad_norm": 4.196715831756592, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.534124032, + "gpu_mem": 4.493728768, + "loss": 1.3219, + "grad_norm": 4.355276584625244, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.534713856, + "gpu_mem": 4.493682688, + "loss": 1.4755, + "grad_norm": 8.10551929473877, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.535107072, + "gpu_mem": 4.493687296, + "loss": 1.4869, + "grad_norm": 8.475208282470703, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.535696896, + "gpu_mem": 4.493684224, + "loss": 1.4113, + "grad_norm": 3.9081897735595703, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.536090112, + "gpu_mem": 4.493702656, + "loss": 1.4301, + "grad_norm": 3.5299301147460938, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.536679936, + "gpu_mem": 4.493694976, + "loss": 1.432, + "grad_norm": 3.733750581741333, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.537073152, + "gpu_mem": 4.493679616, + "loss": 1.4131, + "grad_norm": 4.54693603515625, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.537662976, + "gpu_mem": 4.493750272, + "loss": 1.4287, + "grad_norm": 7.766341209411621, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.538056192, + "gpu_mem": 4.49370112, + "loss": 1.5011, + "grad_norm": 7.705587863922119, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.538449408, + "gpu_mem": 4.493725696, + "loss": 1.4505, + "grad_norm": 11.612521171569824, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.538842624, + "gpu_mem": 4.493696512, + "loss": 1.3902, + "grad_norm": 1.5748393535614014, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.53923584, + "gpu_mem": 4.493688832, + "loss": 1.4001, + "grad_norm": 5.085866451263428, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.539825664, + "gpu_mem": 4.493682688, + "loss": 1.4183, + "grad_norm": 5.551912307739258, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.54021888, + "gpu_mem": 4.493711872, + "loss": 1.4013, + "grad_norm": 2.9323136806488037, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.540415488, + "gpu_mem": 4.493702656, + "loss": 1.3999, + "grad_norm": 5.52058219909668, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.540808704, + "gpu_mem": 4.493690368, + "loss": 1.4038, + "grad_norm": 2.821887493133545, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.54120192, + "gpu_mem": 4.493682688, + "loss": 1.484, + "grad_norm": 5.656933307647705, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.541595136, + "gpu_mem": 4.493734912, + "loss": 1.3851, + "grad_norm": 2.727860689163208, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.54218496, + "gpu_mem": 4.493713408, + "loss": 1.4116, + "grad_norm": 3.2063324451446533, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.542578176, + "gpu_mem": 4.493707264, + "loss": 1.3777, + "grad_norm": 2.35791277885437, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.542971392, + "gpu_mem": 4.493684224, + "loss": 1.3823, + "grad_norm": 2.764387845993042, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.543364608, + "gpu_mem": 4.493705728, + "loss": 1.4616, + "grad_norm": 5.056830883026123, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.543757824, + "gpu_mem": 4.49367808, + "loss": 1.4266, + "grad_norm": 3.285480499267578, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.54415104, + "gpu_mem": 4.49368576, + "loss": 1.4089, + "grad_norm": 3.3122127056121826, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.544544256, + "gpu_mem": 4.493704192, + "loss": 1.3932, + "grad_norm": 2.9871890544891357, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.544937472, + "gpu_mem": 4.49369344, + "loss": 1.3894, + "grad_norm": 2.3417396545410156, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.545330688, + "gpu_mem": 4.493691904, + "loss": 1.3928, + "grad_norm": 4.328145980834961, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.545723904, + "gpu_mem": 4.493687296, + "loss": 1.4125, + "grad_norm": 3.5072646141052246, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.54611712, + "gpu_mem": 4.493691904, + "loss": 1.39, + "grad_norm": 2.7311153411865234, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.546510336, + "gpu_mem": 4.493702656, + "loss": 1.4606, + "grad_norm": 5.132165431976318, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.546903552, + "gpu_mem": 4.493705728, + "loss": 1.4585, + "grad_norm": 4.3537774085998535, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.547296768, + "gpu_mem": 4.493705728, + "loss": 1.4237, + "grad_norm": 2.462576389312744, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.547689984, + "gpu_mem": 4.49370112, + "loss": 1.3792, + "grad_norm": 1.582403540611267, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.5480832, + "gpu_mem": 4.493719552, + "loss": 1.4416, + "grad_norm": 4.008783340454102, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.548476416, + "gpu_mem": 4.493722624, + "loss": 1.4157, + "grad_norm": 2.7078158855438232, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.548869632, + "gpu_mem": 4.493699584, + "loss": 1.3855, + "grad_norm": 2.399202585220337, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.549262848, + "gpu_mem": 4.493710336, + "loss": 1.98, + "grad_norm": 99.14408874511719, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.549459456, + "gpu_mem": 4.493710336, + "loss": 1.4071, + "grad_norm": 14.556191444396973, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.55004928, + "gpu_mem": 4.49368576, + "loss": 1.3791, + "grad_norm": 1.4977352619171143, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.550442496, + "gpu_mem": 4.493714944, + "loss": 1.5159, + "grad_norm": 12.847485542297363, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.550835712, + "gpu_mem": 4.493691904, + "loss": 1.4608, + "grad_norm": 8.917826652526855, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.551228928, + "gpu_mem": 4.4937088, + "loss": 1.4116, + "grad_norm": 4.394010066986084, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.551425536, + "gpu_mem": 4.493676544, + "loss": 1.3509, + "grad_norm": 1.5941187143325806, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.551818752, + "gpu_mem": 4.493691904, + "loss": 1.641, + "grad_norm": 21.597627639770508, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.55201536, + "gpu_mem": 4.493671936, + "loss": 1.8886, + "grad_norm": 24.821104049682617, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.552211968, + "gpu_mem": 4.493713408, + "loss": 1.5249, + "grad_norm": 13.287762641906738, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.552605184, + "gpu_mem": 4.4937088, + "loss": 1.4285, + "grad_norm": 7.848981857299805, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.5529984, + "gpu_mem": 4.493714944, + "loss": 1.4, + "grad_norm": 4.302374362945557, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.553391616, + "gpu_mem": 4.493711872, + "loss": 1.4734, + "grad_norm": 6.15388822555542, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.553784832, + "gpu_mem": 4.493713408, + "loss": 1.4333, + "grad_norm": 6.044255256652832, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.55398144, + "gpu_mem": 4.493710336, + "loss": 1.3763, + "grad_norm": 1.221531867980957, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.554374656, + "gpu_mem": 4.493690368, + "loss": 1.4282, + "grad_norm": 3.8024141788482666, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.554571264, + "gpu_mem": 4.49368576, + "loss": 1.4037, + "grad_norm": 4.485004425048828, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.55496448, + "gpu_mem": 4.493704192, + "loss": 1.4284, + "grad_norm": 3.353834867477417, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.555357696, + "gpu_mem": 4.493714944, + "loss": 1.4158, + "grad_norm": 2.560337781906128, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.55594752, + "gpu_mem": 4.49370112, + "loss": 1.3896, + "grad_norm": 1.378231167793274, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.556340736, + "gpu_mem": 4.49371648, + "loss": 1.3589, + "grad_norm": 2.1569507122039795, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.556733952, + "gpu_mem": 4.493698048, + "loss": 1.5194, + "grad_norm": 8.196989059448242, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.55693056, + "gpu_mem": 4.49372416, + "loss": 1.488, + "grad_norm": 5.567226409912109, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.557323776, + "gpu_mem": 4.493682688, + "loss": 1.4152, + "grad_norm": 3.380398988723755, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.557520384, + "gpu_mem": 4.493714944, + "loss": 1.4248, + "grad_norm": 2.661524772644043, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.5579136, + "gpu_mem": 4.4937088, + "loss": 1.4139, + "grad_norm": 1.5159317255020142, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.558306816, + "gpu_mem": 4.493710336, + "loss": 1.4303, + "grad_norm": 3.063915252685547, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.558700032, + "gpu_mem": 4.49368576, + "loss": 1.402, + "grad_norm": 3.1166622638702393, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.55889664, + "gpu_mem": 4.493694976, + "loss": 1.3961, + "grad_norm": 2.442560911178589, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.559289856, + "gpu_mem": 4.493681152, + "loss": 1.4309, + "grad_norm": 15.250005722045898, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.559486464, + "gpu_mem": 4.493718016, + "loss": 1.3627, + "grad_norm": 1.024130940437317, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.55987968, + "gpu_mem": 4.493714944, + "loss": 1.4223, + "grad_norm": 7.407578468322754, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.560076288, + "gpu_mem": 4.493714944, + "loss": 1.4793, + "grad_norm": 5.812378406524658, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.560469504, + "gpu_mem": 4.493704192, + "loss": 1.3989, + "grad_norm": 2.9975409507751465, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.56086272, + "gpu_mem": 4.493704192, + "loss": 1.3909, + "grad_norm": 3.545229434967041, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.561059328, + "gpu_mem": 4.49368576, + "loss": 1.432, + "grad_norm": 4.322649955749512, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.561452544, + "gpu_mem": 4.493696512, + "loss": 1.436, + "grad_norm": 3.5125973224639893, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.56184576, + "gpu_mem": 4.493705728, + "loss": 1.3855, + "grad_norm": 1.8077596426010132, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.562042368, + "gpu_mem": 4.493721088, + "loss": 1.4225, + "grad_norm": 3.695363998413086, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.562435584, + "gpu_mem": 4.493668864, + "loss": 1.3985, + "grad_norm": 2.126291275024414, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.562632192, + "gpu_mem": 4.493688832, + "loss": 1.4683, + "grad_norm": 4.943833351135254, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.5628288, + "gpu_mem": 4.4936704, + "loss": 1.4495, + "grad_norm": 4.794160842895508, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.563025408, + "gpu_mem": 4.493687296, + "loss": 1.3801, + "grad_norm": 1.7661609649658203, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.563418624, + "gpu_mem": 4.49369344, + "loss": 1.4387, + "grad_norm": 3.704603910446167, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.56381184, + "gpu_mem": 4.493690368, + "loss": 1.4046, + "grad_norm": 1.7467725276947021, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.564008448, + "gpu_mem": 4.49371648, + "loss": 1.4123, + "grad_norm": 1.7539911270141602, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.564401664, + "gpu_mem": 4.493690368, + "loss": 1.3909, + "grad_norm": 1.0946848392486572, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.564598272, + "gpu_mem": 4.493730304, + "loss": 1.3994, + "grad_norm": 3.101085662841797, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.564991488, + "gpu_mem": 4.493679616, + "loss": 1.3827, + "grad_norm": 0.8464837670326233, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.565188096, + "gpu_mem": 4.493688832, + "loss": 1.4307, + "grad_norm": 3.4220845699310303, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.565384704, + "gpu_mem": 4.4937088, + "loss": 1.3846, + "grad_norm": 1.3682743310928345, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.56577792, + "gpu_mem": 4.493699584, + "loss": 1.3904, + "grad_norm": 1.7180205583572388, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.565974528, + "gpu_mem": 4.493711872, + "loss": 1.3966, + "grad_norm": 2.085845947265625, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.566171136, + "gpu_mem": 4.493676544, + "loss": 1.4418, + "grad_norm": 3.004077434539795, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.566367744, + "gpu_mem": 4.493707264, + "loss": 1.4035, + "grad_norm": 2.583996057510376, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.56676096, + "gpu_mem": 4.493702656, + "loss": 1.3589, + "grad_norm": 1.6755117177963257, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.566957568, + "gpu_mem": 4.493727232, + "loss": 1.4213, + "grad_norm": 3.266226291656494, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.567350784, + "gpu_mem": 4.493664256, + "loss": 1.4986, + "grad_norm": 6.114184379577637, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.567547392, + "gpu_mem": 4.493718016, + "loss": 1.4156, + "grad_norm": 4.01192569732666, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.567744, + "gpu_mem": 4.493710336, + "loss": 1.4292, + "grad_norm": 7.798695087432861, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.567940608, + "gpu_mem": 4.493725696, + "loss": 1.4027, + "grad_norm": 2.2231059074401855, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.568333824, + "gpu_mem": 4.493727232, + "loss": 1.3881, + "grad_norm": 1.0521613359451294, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.568530432, + "gpu_mem": 4.493694976, + "loss": 1.3942, + "grad_norm": 2.6183040142059326, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.568923648, + "gpu_mem": 4.493687296, + "loss": 1.381, + "grad_norm": 2.2281689643859863, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.569120256, + "gpu_mem": 4.493722624, + "loss": 1.4081, + "grad_norm": 3.263665199279785, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.569513472, + "gpu_mem": 4.4936704, + "loss": 1.4317, + "grad_norm": 4.399308204650879, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.56971008, + "gpu_mem": 4.493705728, + "loss": 1.3721, + "grad_norm": 2.141644239425659, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.569906688, + "gpu_mem": 4.493694976, + "loss": 1.4706, + "grad_norm": 4.285233974456787, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.570103296, + "gpu_mem": 4.493727232, + "loss": 1.552, + "grad_norm": 6.6666998863220215, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.570299904, + "gpu_mem": 4.493721088, + "loss": 1.3913, + "grad_norm": 3.463447332382202, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.570496512, + "gpu_mem": 4.493699584, + "loss": 1.4353, + "grad_norm": 2.7253623008728027, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.570889728, + "gpu_mem": 4.493696512, + "loss": 1.4023, + "grad_norm": 2.6509969234466553, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.571086336, + "gpu_mem": 4.493733376, + "loss": 1.3941, + "grad_norm": 1.3880903720855713, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.571282944, + "gpu_mem": 4.493707264, + "loss": 1.4069, + "grad_norm": 2.654820680618286, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.571479552, + "gpu_mem": 4.493684224, + "loss": 1.4101, + "grad_norm": 3.2832846641540527, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.57167616, + "gpu_mem": 4.493710336, + "loss": 1.4399, + "grad_norm": 5.191057205200195, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.572069376, + "gpu_mem": 4.493722624, + "loss": 1.4419, + "grad_norm": 5.686864852905273, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.572265984, + "gpu_mem": 4.493684224, + "loss": 1.3784, + "grad_norm": 1.6513653993606567, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.5726592, + "gpu_mem": 4.49369344, + "loss": 1.4147, + "grad_norm": 3.035165548324585, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.572855808, + "gpu_mem": 4.493675008, + "loss": 1.4157, + "grad_norm": 3.022012233734131, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.573052416, + "gpu_mem": 4.493688832, + "loss": 1.383, + "grad_norm": 1.7340534925460815, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.573249024, + "gpu_mem": 4.493696512, + "loss": 1.4127, + "grad_norm": 1.7332732677459717, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.573445632, + "gpu_mem": 4.49367808, + "loss": 1.4004, + "grad_norm": 1.399063229560852, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.57364224, + "gpu_mem": 4.4937088, + "loss": 1.3718, + "grad_norm": 0.6799705028533936, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.573838848, + "gpu_mem": 4.493679616, + "loss": 1.388, + "grad_norm": 1.7067478895187378, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.574035456, + "gpu_mem": 4.493704192, + "loss": 1.3945, + "grad_norm": 1.186632752418518, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.574232064, + "gpu_mem": 4.493684224, + "loss": 1.4723, + "grad_norm": 3.7060043811798096, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.57462528, + "gpu_mem": 4.49371648, + "loss": 1.3572, + "grad_norm": 2.197092056274414, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.574821888, + "gpu_mem": 4.493707264, + "loss": 1.5174, + "grad_norm": 7.260722637176514, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.575018496, + "gpu_mem": 4.493702656, + "loss": 1.3486, + "grad_norm": 1.9174777269363403, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.575018496, + "gpu_mem": 4.493659648, + "loss": 1.4793, + "grad_norm": 5.139273643493652, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.575411712, + "gpu_mem": 4.49373952, + "loss": 1.41, + "grad_norm": 2.1777617931365967, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.57560832, + "gpu_mem": 4.493690368, + "loss": 1.4048, + "grad_norm": 2.9862215518951416, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.575804928, + "gpu_mem": 4.493690368, + "loss": 1.4144, + "grad_norm": 3.4619853496551514, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.576001536, + "gpu_mem": 4.493656576, + "loss": 1.5094, + "grad_norm": 6.558995723724365, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.576198144, + "gpu_mem": 4.493696512, + "loss": 1.3693, + "grad_norm": 2.896925687789917, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.576394752, + "gpu_mem": 4.493691904, + "loss": 1.4249, + "grad_norm": 6.217208385467529, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.576787968, + "gpu_mem": 4.493679616, + "loss": 1.4463, + "grad_norm": 4.499203205108643, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.576984576, + "gpu_mem": 4.493704192, + "loss": 1.4207, + "grad_norm": 3.666639804840088, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.577181184, + "gpu_mem": 4.493719552, + "loss": 1.4176, + "grad_norm": 3.0117995738983154, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.577377792, + "gpu_mem": 4.493691904, + "loss": 1.466, + "grad_norm": 5.473692893981934, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.5775744, + "gpu_mem": 4.493688832, + "loss": 1.4231, + "grad_norm": 5.374265193939209, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.5775744, + "gpu_mem": 4.493704192, + "loss": 1.4664, + "grad_norm": 6.867026329040527, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.577771008, + "gpu_mem": 4.493681152, + "loss": 1.4662, + "grad_norm": 7.550572395324707, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.577967616, + "gpu_mem": 4.493682688, + "loss": 1.4229, + "grad_norm": 5.205974578857422, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.578164224, + "gpu_mem": 4.49372416, + "loss": 1.3776, + "grad_norm": 3.7338075637817383, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.57855744, + "gpu_mem": 4.493694976, + "loss": 1.4024, + "grad_norm": 2.724646806716919, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.578754048, + "gpu_mem": 4.493694976, + "loss": 1.3854, + "grad_norm": 2.165105104446411, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.578950656, + "gpu_mem": 4.493691904, + "loss": 1.424, + "grad_norm": 3.1097586154937744, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.579147264, + "gpu_mem": 4.493691904, + "loss": 1.3886, + "grad_norm": 1.4011125564575195, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.579343872, + "gpu_mem": 4.493682688, + "loss": 1.4011, + "grad_norm": 1.5423967838287354, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.57954048, + "gpu_mem": 4.493718016, + "loss": 1.4325, + "grad_norm": 2.9985785484313965, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.579737088, + "gpu_mem": 4.493675008, + "loss": 1.4045, + "grad_norm": 2.063185691833496, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.579933696, + "gpu_mem": 4.493702656, + "loss": 1.3806, + "grad_norm": 1.3520841598510742, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.580130304, + "gpu_mem": 4.493711872, + "loss": 1.3341, + "grad_norm": 2.6181979179382324, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.580326912, + "gpu_mem": 4.493684224, + "loss": 1.3451, + "grad_norm": 2.5867297649383545, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.58052352, + "gpu_mem": 4.49369344, + "loss": 1.533, + "grad_norm": 7.587326526641846, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.580720128, + "gpu_mem": 4.493694976, + "loss": 1.6414, + "grad_norm": 10.505845069885254, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.580916736, + "gpu_mem": 4.493694976, + "loss": 1.4108, + "grad_norm": 3.6265363693237305, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.581113344, + "gpu_mem": 4.493679616, + "loss": 1.4475, + "grad_norm": 4.2208170890808105, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.58150656, + "gpu_mem": 4.49370112, + "loss": 1.405, + "grad_norm": 2.6960268020629883, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.58150656, + "gpu_mem": 4.493734912, + "loss": 1.373, + "grad_norm": 2.0517070293426514, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.581703168, + "gpu_mem": 4.493688832, + "loss": 1.4247, + "grad_norm": 3.875562906265259, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.581899776, + "gpu_mem": 4.493694976, + "loss": 1.4166, + "grad_norm": 3.199223041534424, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.582096384, + "gpu_mem": 4.493710336, + "loss": 1.3921, + "grad_norm": 1.9863297939300537, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.582292992, + "gpu_mem": 4.493728768, + "loss": 1.441, + "grad_norm": 3.047760009765625, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.5824896, + "gpu_mem": 4.493698048, + "loss": 1.4467, + "grad_norm": 3.33308744430542, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.582686208, + "gpu_mem": 4.493684224, + "loss": 1.4074, + "grad_norm": 1.8380095958709717, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.582882816, + "gpu_mem": 4.493676544, + "loss": 1.4008, + "grad_norm": 1.3371466398239136, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.582882816, + "gpu_mem": 4.493741056, + "loss": 1.386, + "grad_norm": 1.1064872741699219, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.583079424, + "gpu_mem": 4.493679616, + "loss": 1.4048, + "grad_norm": 2.254940986633301, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.58347264, + "gpu_mem": 4.49373184, + "loss": 1.3879, + "grad_norm": 0.4038461148738861, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.583669248, + "gpu_mem": 4.493713408, + "loss": 1.3935, + "grad_norm": 0.7845887541770935, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.583865856, + "gpu_mem": 4.493711872, + "loss": 1.3984, + "grad_norm": 2.049678087234497, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.584062464, + "gpu_mem": 4.49371648, + "loss": 1.4006, + "grad_norm": 1.617073655128479, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.584259072, + "gpu_mem": 4.493691904, + "loss": 1.3703, + "grad_norm": 2.133340358734131, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.584259072, + "gpu_mem": 4.493721088, + "loss": 1.4254, + "grad_norm": 1.990647315979004, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.58445568, + "gpu_mem": 4.493698048, + "loss": 1.3543, + "grad_norm": 1.510504961013794, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.584652288, + "gpu_mem": 4.493759488, + "loss": 1.4352, + "grad_norm": 2.5057499408721924, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.584652288, + "gpu_mem": 4.493684224, + "loss": 1.4707, + "grad_norm": 3.7832345962524414, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.584848896, + "gpu_mem": 4.493694976, + "loss": 1.44, + "grad_norm": 3.310288667678833, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.585045504, + "gpu_mem": 4.49369344, + "loss": 1.408, + "grad_norm": 1.5556061267852783, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.585242112, + "gpu_mem": 4.493690368, + "loss": 1.4012, + "grad_norm": 1.470806360244751, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.58543872, + "gpu_mem": 4.493721088, + "loss": 1.3965, + "grad_norm": 1.5376025438308716, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.585635328, + "gpu_mem": 4.493699584, + "loss": 1.3986, + "grad_norm": 1.8252325057983398, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.585831936, + "gpu_mem": 4.493694976, + "loss": 1.4887, + "grad_norm": 4.529153823852539, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.586028544, + "gpu_mem": 4.493705728, + "loss": 1.3962, + "grad_norm": 2.618587017059326, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.586225152, + "gpu_mem": 4.493710336, + "loss": 1.4063, + "grad_norm": 1.5824165344238281, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.58642176, + "gpu_mem": 4.493671936, + "loss": 1.3881, + "grad_norm": 0.519913375377655, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.586618368, + "gpu_mem": 4.49373952, + "loss": 1.3972, + "grad_norm": 1.3940930366516113, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.586814976, + "gpu_mem": 4.493702656, + "loss": 1.3859, + "grad_norm": 1.6022673845291138, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.586814976, + "gpu_mem": 4.493691904, + "loss": 1.4088, + "grad_norm": 1.3627089262008667, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.587011584, + "gpu_mem": 4.4937088, + "loss": 1.4065, + "grad_norm": 1.6760859489440918, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.587011584, + "gpu_mem": 4.493682688, + "loss": 1.4501, + "grad_norm": 3.603651762008667, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.587208192, + "gpu_mem": 4.493730304, + "loss": 1.4012, + "grad_norm": 1.254109263420105, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.5874048, + "gpu_mem": 4.493698048, + "loss": 1.4569, + "grad_norm": 3.6934475898742676, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.587601408, + "gpu_mem": 4.493687296, + "loss": 1.4062, + "grad_norm": 2.7059686183929443, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.587798016, + "gpu_mem": 4.493702656, + "loss": 1.4042, + "grad_norm": 1.5245215892791748, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.587798016, + "gpu_mem": 4.493699584, + "loss": 1.4003, + "grad_norm": 1.2780299186706543, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.587994624, + "gpu_mem": 4.493699584, + "loss": 1.4221, + "grad_norm": 2.27565860748291, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.588191232, + "gpu_mem": 4.493687296, + "loss": 1.3778, + "grad_norm": 0.8354502320289612, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.58838784, + "gpu_mem": 4.4936704, + "loss": 1.3729, + "grad_norm": 0.608502984046936, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.588584448, + "gpu_mem": 4.493733376, + "loss": 1.4182, + "grad_norm": 1.652549386024475, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.588584448, + "gpu_mem": 4.493687296, + "loss": 1.4048, + "grad_norm": 1.5306965112686157, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.588781056, + "gpu_mem": 4.493696512, + "loss": 1.4232, + "grad_norm": 2.658778429031372, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.588781056, + "gpu_mem": 4.49373184, + "loss": 1.4053, + "grad_norm": 1.580998420715332, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.588977664, + "gpu_mem": 4.493696512, + "loss": 1.3727, + "grad_norm": 0.4003816843032837, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.589174272, + "gpu_mem": 4.49370112, + "loss": 1.4444, + "grad_norm": 3.0271286964416504, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.58937088, + "gpu_mem": 4.493748736, + "loss": 1.4373, + "grad_norm": 2.9099514484405518, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.589567488, + "gpu_mem": 4.493757952, + "loss": 1.407, + "grad_norm": 1.8013681173324585, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.589764096, + "gpu_mem": 4.493711872, + "loss": 1.4199, + "grad_norm": 1.973847508430481, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.589764096, + "gpu_mem": 4.493705728, + "loss": 1.3746, + "grad_norm": 1.5956662893295288, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.589764096, + "gpu_mem": 4.493767168, + "loss": 1.3996, + "grad_norm": 1.5277106761932373, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.589960704, + "gpu_mem": 4.49369344, + "loss": 1.4311, + "grad_norm": 1.7472527027130127, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.590157312, + "gpu_mem": 4.493691904, + "loss": 1.4201, + "grad_norm": 1.3693735599517822, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.59035392, + "gpu_mem": 4.493694976, + "loss": 1.3919, + "grad_norm": 0.5059611201286316, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.590550528, + "gpu_mem": 4.493681152, + "loss": 1.4002, + "grad_norm": 1.5446606874465942, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.590550528, + "gpu_mem": 4.493696512, + "loss": 1.3959, + "grad_norm": 1.0899953842163086, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.590747136, + "gpu_mem": 4.493734912, + "loss": 1.3882, + "grad_norm": 0.6127272844314575, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.590943744, + "gpu_mem": 4.493714944, + "loss": 1.4099, + "grad_norm": 1.6484986543655396, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.590943744, + "gpu_mem": 4.493741056, + "loss": 1.3914, + "grad_norm": 1.6406583786010742, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.591140352, + "gpu_mem": 4.493691904, + "loss": 1.3898, + "grad_norm": 1.0598000288009644, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.59133696, + "gpu_mem": 4.49368576, + "loss": 1.3888, + "grad_norm": 1.1888771057128906, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.59133696, + "gpu_mem": 4.4937088, + "loss": 1.3977, + "grad_norm": 1.2122161388397217, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.591533568, + "gpu_mem": 4.493687296, + "loss": 1.3823, + "grad_norm": 1.0279420614242554, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.591533568, + "gpu_mem": 4.49370112, + "loss": 1.3991, + "grad_norm": 1.142988920211792, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.591730176, + "gpu_mem": 4.493705728, + "loss": 1.4332, + "grad_norm": 2.2135331630706787, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.591926784, + "gpu_mem": 4.49372416, + "loss": 1.4516, + "grad_norm": 2.286379337310791, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.591926784, + "gpu_mem": 4.493694976, + "loss": 1.4019, + "grad_norm": 1.2485969066619873, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.591926784, + "gpu_mem": 4.493722624, + "loss": 1.3878, + "grad_norm": 0.6220881342887878, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.592123392, + "gpu_mem": 4.493704192, + "loss": 1.3902, + "grad_norm": 1.849666714668274, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.59232, + "gpu_mem": 4.493691904, + "loss": 1.4369, + "grad_norm": 3.4150469303131104, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.59232, + "gpu_mem": 4.49370112, + "loss": 1.4393, + "grad_norm": 3.647698163986206, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.592516608, + "gpu_mem": 4.493698048, + "loss": 1.4181, + "grad_norm": 2.9625155925750732, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.592713216, + "gpu_mem": 4.493713408, + "loss": 1.4101, + "grad_norm": 2.5676934719085693, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.592909824, + "gpu_mem": 4.493721088, + "loss": 1.3872, + "grad_norm": 0.9573443531990051, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.592909824, + "gpu_mem": 4.493710336, + "loss": 1.3942, + "grad_norm": 1.8636502027511597, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.593106432, + "gpu_mem": 4.493694976, + "loss": 1.4266, + "grad_norm": 2.770134687423706, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.59330304, + "gpu_mem": 4.493698048, + "loss": 1.4203, + "grad_norm": 2.0802536010742188, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.59330304, + "gpu_mem": 4.493691904, + "loss": 1.3907, + "grad_norm": 1.5511400699615479, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.593499648, + "gpu_mem": 4.493687296, + "loss": 1.4239, + "grad_norm": 2.4178197383880615, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.593499648, + "gpu_mem": 4.4937088, + "loss": 1.4051, + "grad_norm": 1.148271083831787, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.593696256, + "gpu_mem": 4.49370112, + "loss": 1.3854, + "grad_norm": 1.5475651025772095, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.593696256, + "gpu_mem": 4.493673472, + "loss": 1.3826, + "grad_norm": 0.4008638560771942, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.593696256, + "gpu_mem": 4.493671936, + "loss": 1.3876, + "grad_norm": 0.690343976020813, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.593892864, + "gpu_mem": 4.493698048, + "loss": 1.4268, + "grad_norm": 1.9171669483184814, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.594089472, + "gpu_mem": 4.493681152, + "loss": 1.3795, + "grad_norm": 0.17349448800086975, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.594089472, + "gpu_mem": 4.493711872, + "loss": 1.3322, + "grad_norm": 2.179943561553955, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.59428608, + "gpu_mem": 4.493694976, + "loss": 1.4204, + "grad_norm": 1.6761813163757324, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.59428608, + "gpu_mem": 4.493725696, + "loss": 1.393, + "grad_norm": 1.165014386177063, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.59428608, + "gpu_mem": 4.49369344, + "loss": 1.3879, + "grad_norm": 0.47699567675590515, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.594482688, + "gpu_mem": 4.493719552, + "loss": 1.4132, + "grad_norm": 1.3407409191131592, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.594679296, + "gpu_mem": 4.493694976, + "loss": 1.3905, + "grad_norm": 0.6140868663787842, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.594679296, + "gpu_mem": 4.493690368, + "loss": 1.3994, + "grad_norm": 0.9012115597724915, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.594679296, + "gpu_mem": 4.49369344, + "loss": 1.3729, + "grad_norm": 0.7588595747947693, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.594875904, + "gpu_mem": 4.493711872, + "loss": 1.3866, + "grad_norm": 0.44822967052459717, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.595072512, + "gpu_mem": 4.493691904, + "loss": 1.3898, + "grad_norm": 1.0178906917572021, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.595072512, + "gpu_mem": 4.493696512, + "loss": 1.3833, + "grad_norm": 0.3506338894367218, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.59526912, + "gpu_mem": 4.493691904, + "loss": 1.3822, + "grad_norm": 0.8158572316169739, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.59526912, + "gpu_mem": 4.493699584, + "loss": 1.392, + "grad_norm": 0.4946703314781189, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.595465728, + "gpu_mem": 4.49372416, + "loss": 1.3699, + "grad_norm": 0.4614749848842621, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.595465728, + "gpu_mem": 4.49371648, + "loss": 1.3976, + "grad_norm": 0.8733381628990173, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.595662336, + "gpu_mem": 4.493718016, + "loss": 1.3627, + "grad_norm": 0.45789244771003723, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.595662336, + "gpu_mem": 4.49369344, + "loss": 1.375, + "grad_norm": 0.9970502853393555, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.595858944, + "gpu_mem": 4.493694976, + "loss": 1.4159, + "grad_norm": 1.6032041311264038, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.595858944, + "gpu_mem": 4.493714944, + "loss": 1.3754, + "grad_norm": 0.6471379399299622, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.596055552, + "gpu_mem": 4.493687296, + "loss": 1.4328, + "grad_norm": 2.1520540714263916, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.596055552, + "gpu_mem": 4.493699584, + "loss": 1.3571, + "grad_norm": 1.0202984809875488, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.59625216, + "gpu_mem": 4.4937088, + "loss": 1.4125, + "grad_norm": 1.6029874086380005, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.596448768, + "gpu_mem": 4.49368576, + "loss": 1.367, + "grad_norm": 1.4833121299743652, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.596448768, + "gpu_mem": 4.493710336, + "loss": 1.4041, + "grad_norm": 1.9707703590393066, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.596448768, + "gpu_mem": 4.493710336, + "loss": 1.413, + "grad_norm": 1.553128957748413, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.596448768, + "gpu_mem": 4.49369344, + "loss": 1.421, + "grad_norm": 1.793030858039856, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.596645376, + "gpu_mem": 4.493690368, + "loss": 1.3431, + "grad_norm": 2.297577381134033, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.596645376, + "gpu_mem": 4.493682688, + "loss": 1.3976, + "grad_norm": 1.7003600597381592, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.596645376, + "gpu_mem": 4.493721088, + "loss": 1.4507, + "grad_norm": 2.6380066871643066, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.596645376, + "gpu_mem": 4.493698048, + "loss": 1.4276, + "grad_norm": 1.6790764331817627, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.596841984, + "gpu_mem": 4.493696512, + "loss": 1.4002, + "grad_norm": 0.9641213417053223, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.596841984, + "gpu_mem": 4.493713408, + "loss": 1.3872, + "grad_norm": 1.0013465881347656, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.597038592, + "gpu_mem": 4.493698048, + "loss": 1.3894, + "grad_norm": 0.9656116366386414, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.597038592, + "gpu_mem": 4.493710336, + "loss": 1.3981, + "grad_norm": 2.8231287002563477, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.5972352, + "gpu_mem": 4.493722624, + "loss": 1.3842, + "grad_norm": 2.512460231781006, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.5972352, + "gpu_mem": 4.493698048, + "loss": 1.4314, + "grad_norm": 4.306180477142334, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.597431808, + "gpu_mem": 4.493742592, + "loss": 1.3422, + "grad_norm": 3.0044586658477783, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.597431808, + "gpu_mem": 4.49371648, + "loss": 1.3965, + "grad_norm": 2.7223880290985107, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.597628416, + "gpu_mem": 4.493713408, + "loss": 1.3893, + "grad_norm": 1.8329936265945435, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.597628416, + "gpu_mem": 4.493694976, + "loss": 1.4026, + "grad_norm": 2.222440719604492, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.597825024, + "gpu_mem": 4.49370112, + "loss": 1.4278, + "grad_norm": 4.1339826583862305, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.597825024, + "gpu_mem": 4.4936704, + "loss": 1.3912, + "grad_norm": 2.4584829807281494, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.597825024, + "gpu_mem": 4.493734912, + "loss": 1.3941, + "grad_norm": 2.1184160709381104, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.597825024, + "gpu_mem": 4.493688832, + "loss": 1.3777, + "grad_norm": 1.837175726890564, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.597825024, + "gpu_mem": 4.493682688, + "loss": 1.3849, + "grad_norm": 1.3756186962127686, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.598021632, + "gpu_mem": 4.493737984, + "loss": 1.3961, + "grad_norm": 1.6659722328186035, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.598021632, + "gpu_mem": 4.493704192, + "loss": 1.3756, + "grad_norm": 1.7549289464950562, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.59821824, + "gpu_mem": 4.493691904, + "loss": 1.417, + "grad_norm": 4.088860034942627, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.598414848, + "gpu_mem": 4.493696512, + "loss": 1.3992, + "grad_norm": 1.7297906875610352, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.598414848, + "gpu_mem": 4.493676544, + "loss": 1.3922, + "grad_norm": 1.0854960680007935, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.598414848, + "gpu_mem": 4.49370112, + "loss": 1.3934, + "grad_norm": 1.028552532196045, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.598611456, + "gpu_mem": 4.493679616, + "loss": 1.3859, + "grad_norm": 0.44537895917892456, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.598611456, + "gpu_mem": 4.493696512, + "loss": 1.3917, + "grad_norm": 0.9094427227973938, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.598611456, + "gpu_mem": 4.493661184, + "loss": 1.3917, + "grad_norm": 0.6126585602760315, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.598611456, + "gpu_mem": 4.49369344, + "loss": 1.3916, + "grad_norm": 0.827272355556488, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.598808064, + "gpu_mem": 4.493682688, + "loss": 1.3872, + "grad_norm": 0.4583181142807007, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.598808064, + "gpu_mem": 4.493719552, + "loss": 1.4222, + "grad_norm": 2.1439390182495117, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.599004672, + "gpu_mem": 4.49368576, + "loss": 1.3917, + "grad_norm": 1.1583024263381958, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.599004672, + "gpu_mem": 4.4937088, + "loss": 1.3968, + "grad_norm": 1.9280472993850708, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.59920128, + "gpu_mem": 4.493698048, + "loss": 1.3887, + "grad_norm": 0.9777529835700989, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.59920128, + "gpu_mem": 4.493704192, + "loss": 1.3777, + "grad_norm": 0.43149808049201965, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.59920128, + "gpu_mem": 4.493698048, + "loss": 1.4384, + "grad_norm": 2.2030677795410156, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.59920128, + "gpu_mem": 4.49371648, + "loss": 1.3711, + "grad_norm": 0.8699899911880493, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.59920128, + "gpu_mem": 4.493676544, + "loss": 1.3851, + "grad_norm": 0.37553226947784424, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.599397888, + "gpu_mem": 4.4937088, + "loss": 1.3818, + "grad_norm": 0.44437241554260254, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.599594496, + "gpu_mem": 4.493728768, + "loss": 1.4118, + "grad_norm": 1.4248298406600952, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.599594496, + "gpu_mem": 4.493722624, + "loss": 1.3994, + "grad_norm": 1.1716581583023071, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.599594496, + "gpu_mem": 4.49368576, + "loss": 1.4123, + "grad_norm": 2.0044562816619873, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.599594496, + "gpu_mem": 4.493702656, + "loss": 1.4045, + "grad_norm": 1.933052897453308, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.599594496, + "gpu_mem": 4.493679616, + "loss": 1.4013, + "grad_norm": 2.7478530406951904, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.599594496, + "gpu_mem": 4.493711872, + "loss": 1.4227, + "grad_norm": 3.7173879146575928, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.599791104, + "gpu_mem": 4.493707264, + "loss": 1.3889, + "grad_norm": 1.0331683158874512, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.599791104, + "gpu_mem": 4.49371648, + "loss": 1.4087, + "grad_norm": 2.4171321392059326, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.599987712, + "gpu_mem": 4.493690368, + "loss": 1.3958, + "grad_norm": 1.2371629476547241, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.599987712, + "gpu_mem": 4.493710336, + "loss": 1.3991, + "grad_norm": 3.260352373123169, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.599987712, + "gpu_mem": 4.493684224, + "loss": 1.3938, + "grad_norm": 1.2536730766296387, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.599987712, + "gpu_mem": 4.4937088, + "loss": 1.382, + "grad_norm": 1.4392590522766113, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.599987712, + "gpu_mem": 4.49369344, + "loss": 1.4042, + "grad_norm": 3.0108911991119385, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.60018432, + "gpu_mem": 4.493727232, + "loss": 1.3566, + "grad_norm": 1.7607405185699463, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.600380928, + "gpu_mem": 4.493707264, + "loss": 1.3725, + "grad_norm": 1.13426673412323, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.600380928, + "gpu_mem": 4.493691904, + "loss": 1.4038, + "grad_norm": 2.009767770767212, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.600380928, + "gpu_mem": 4.493727232, + "loss": 1.4014, + "grad_norm": 1.2211600542068481, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.600380928, + "gpu_mem": 4.493733376, + "loss": 1.4112, + "grad_norm": 2.0082504749298096, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.600577536, + "gpu_mem": 4.493696512, + "loss": 1.4219, + "grad_norm": 1.952669620513916, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.600577536, + "gpu_mem": 4.493675008, + "loss": 1.3863, + "grad_norm": 1.3320780992507935, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.600577536, + "gpu_mem": 4.493727232, + "loss": 1.3662, + "grad_norm": 1.1044321060180664, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.600577536, + "gpu_mem": 4.493713408, + "loss": 1.4116, + "grad_norm": 1.1462204456329346, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.600577536, + "gpu_mem": 4.493707264, + "loss": 1.4042, + "grad_norm": 1.1133263111114502, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.600577536, + "gpu_mem": 4.493713408, + "loss": 1.3936, + "grad_norm": 1.117222547531128, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.600774144, + "gpu_mem": 4.493690368, + "loss": 1.4025, + "grad_norm": 1.5059454441070557, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.600774144, + "gpu_mem": 4.493704192, + "loss": 1.4298, + "grad_norm": 1.9505493640899658, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.600774144, + "gpu_mem": 4.493704192, + "loss": 1.3983, + "grad_norm": 1.5805071592330933, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.600774144, + "gpu_mem": 4.493673472, + "loss": 1.358, + "grad_norm": 1.768936038017273, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.600774144, + "gpu_mem": 4.493707264, + "loss": 1.3697, + "grad_norm": 0.6294443607330322, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.600970752, + "gpu_mem": 4.49368576, + "loss": 1.4018, + "grad_norm": 1.0273836851119995, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.60116736, + "gpu_mem": 4.49369344, + "loss": 1.3997, + "grad_norm": 0.8733883500099182, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.60116736, + "gpu_mem": 4.493711872, + "loss": 1.3957, + "grad_norm": 0.6830035448074341, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.60116736, + "gpu_mem": 4.493679616, + "loss": 1.3875, + "grad_norm": 0.4566798508167267, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.60116736, + "gpu_mem": 4.493684224, + "loss": 1.3894, + "grad_norm": 0.5209246277809143, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.60116736, + "gpu_mem": 4.493679616, + "loss": 1.3888, + "grad_norm": 0.6446499824523926, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.60116736, + "gpu_mem": 4.49372416, + "loss": 1.3888, + "grad_norm": 1.1024909019470215, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.601363968, + "gpu_mem": 4.493707264, + "loss": 1.3957, + "grad_norm": 1.7732677459716797, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.601560576, + "gpu_mem": 4.493696512, + "loss": 1.3861, + "grad_norm": 0.5750253200531006, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.601560576, + "gpu_mem": 4.493718016, + "loss": 1.3979, + "grad_norm": 2.183753252029419, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.601560576, + "gpu_mem": 4.493684224, + "loss": 1.382, + "grad_norm": 0.21420668065547943, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.601560576, + "gpu_mem": 4.493699584, + "loss": 1.4124, + "grad_norm": 1.9374762773513794, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.601757184, + "gpu_mem": 4.493699584, + "loss": 1.3999, + "grad_norm": 1.1729366779327393, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.601757184, + "gpu_mem": 4.493690368, + "loss": 1.3956, + "grad_norm": 1.0518842935562134, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.601757184, + "gpu_mem": 4.49370112, + "loss": 1.3996, + "grad_norm": 1.779776692390442, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.601757184, + "gpu_mem": 4.493725696, + "loss": 1.3853, + "grad_norm": 0.8769326210021973, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.601757184, + "gpu_mem": 4.49367808, + "loss": 1.3737, + "grad_norm": 0.9254390597343445, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.601757184, + "gpu_mem": 4.493713408, + "loss": 1.3993, + "grad_norm": 1.0260326862335205, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.601953792, + "gpu_mem": 4.493675008, + "loss": 1.39, + "grad_norm": 1.2507468461990356, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.601953792, + "gpu_mem": 4.49369344, + "loss": 1.3861, + "grad_norm": 0.8870686888694763, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.601953792, + "gpu_mem": 4.49368576, + "loss": 1.3878, + "grad_norm": 0.602741539478302, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.601953792, + "gpu_mem": 4.493722624, + "loss": 1.3867, + "grad_norm": 0.37810513377189636, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.601953792, + "gpu_mem": 4.493682688, + "loss": 1.3963, + "grad_norm": 0.8927867412567139, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.601953792, + "gpu_mem": 4.493696512, + "loss": 1.4174, + "grad_norm": 1.4727171659469604, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.6021504, + "gpu_mem": 4.49370112, + "loss": 1.4028, + "grad_norm": 1.0990833044052124, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.602347008, + "gpu_mem": 4.49366272, + "loss": 1.3937, + "grad_norm": 1.3241934776306152, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.602347008, + "gpu_mem": 4.49368576, + "loss": 1.3966, + "grad_norm": 0.5847589373588562, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.602347008, + "gpu_mem": 4.493684224, + "loss": 1.3801, + "grad_norm": 0.45842060446739197, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.602347008, + "gpu_mem": 4.493702656, + "loss": 1.3971, + "grad_norm": 0.8098547458648682, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.602347008, + "gpu_mem": 4.493699584, + "loss": 1.3811, + "grad_norm": 0.511680006980896, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.602347008, + "gpu_mem": 4.493698048, + "loss": 1.3899, + "grad_norm": 0.29829666018486023, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.602543616, + "gpu_mem": 4.49371648, + "loss": 1.3767, + "grad_norm": 0.669538676738739, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.602543616, + "gpu_mem": 4.49367808, + "loss": 1.384, + "grad_norm": 0.17513222992420197, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.602543616, + "gpu_mem": 4.493722624, + "loss": 1.4047, + "grad_norm": 0.8988094329833984, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.602543616, + "gpu_mem": 4.493687296, + "loss": 1.3907, + "grad_norm": 0.6710613369941711, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.602543616, + "gpu_mem": 4.493714944, + "loss": 1.4087, + "grad_norm": 0.9900716543197632, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.602543616, + "gpu_mem": 4.493694976, + "loss": 1.3814, + "grad_norm": 0.2215026319026947, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.602543616, + "gpu_mem": 4.493741056, + "loss": 1.3965, + "grad_norm": 1.2002172470092773, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.602543616, + "gpu_mem": 4.493705728, + "loss": 1.3833, + "grad_norm": 0.3051542341709137, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.602740224, + "gpu_mem": 4.493696512, + "loss": 1.3812, + "grad_norm": 0.5095186233520508, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.602740224, + "gpu_mem": 4.493690368, + "loss": 1.3841, + "grad_norm": 0.3401910066604614, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.602936832, + "gpu_mem": 4.493675008, + "loss": 1.3826, + "grad_norm": 0.4024999141693115, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.602936832, + "gpu_mem": 4.49369344, + "loss": 1.3779, + "grad_norm": 1.2372214794158936, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.602936832, + "gpu_mem": 4.493694976, + "loss": 1.3636, + "grad_norm": 0.9630852341651917, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.602936832, + "gpu_mem": 4.493699584, + "loss": 1.4024, + "grad_norm": 1.0144751071929932, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.602936832, + "gpu_mem": 4.493702656, + "loss": 1.3882, + "grad_norm": 0.4893753230571747, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.602936832, + "gpu_mem": 4.493696512, + "loss": 1.3992, + "grad_norm": 1.1759904623031616, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.602936832, + "gpu_mem": 4.493722624, + "loss": 1.4107, + "grad_norm": 1.0433400869369507, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.602936832, + "gpu_mem": 4.493690368, + "loss": 1.3809, + "grad_norm": 1.2233047485351562, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.60313344, + "gpu_mem": 4.493718016, + "loss": 1.4041, + "grad_norm": 1.2633004188537598, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.60313344, + "gpu_mem": 4.493725696, + "loss": 1.368, + "grad_norm": 1.2140766382217407, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.60313344, + "gpu_mem": 4.493707264, + "loss": 1.4165, + "grad_norm": 1.542338490486145, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.60313344, + "gpu_mem": 4.49369344, + "loss": 1.4128, + "grad_norm": 1.205672264099121, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.60313344, + "gpu_mem": 4.493704192, + "loss": 1.3984, + "grad_norm": 0.7546482086181641, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.60313344, + "gpu_mem": 4.493696512, + "loss": 1.381, + "grad_norm": 0.7733486890792847, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.603330048, + "gpu_mem": 4.493713408, + "loss": 1.3839, + "grad_norm": 0.2343958169221878, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.603330048, + "gpu_mem": 4.49368576, + "loss": 1.3796, + "grad_norm": 0.7431733012199402, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.603330048, + "gpu_mem": 4.49371648, + "loss": 1.4029, + "grad_norm": 1.2416259050369263, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.603330048, + "gpu_mem": 4.493698048, + "loss": 1.3747, + "grad_norm": 0.8814178109169006, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.603330048, + "gpu_mem": 4.49368576, + "loss": 1.3909, + "grad_norm": 0.7970499396324158, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.603330048, + "gpu_mem": 4.493698048, + "loss": 1.416, + "grad_norm": 1.6416066884994507, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.603330048, + "gpu_mem": 4.493704192, + "loss": 1.4008, + "grad_norm": 1.0982202291488647, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.603330048, + "gpu_mem": 4.493691904, + "loss": 1.4012, + "grad_norm": 0.9338749647140503, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.603330048, + "gpu_mem": 4.493681152, + "loss": 1.4055, + "grad_norm": 1.1043078899383545, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.603330048, + "gpu_mem": 4.493682688, + "loss": 1.392, + "grad_norm": 0.9609983563423157, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493696512, + "loss": 1.3868, + "grad_norm": 0.1265050619840622, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493699584, + "loss": 1.3864, + "grad_norm": 0.17773757874965668, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493710336, + "loss": 1.3779, + "grad_norm": 0.6510043740272522, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493684224, + "loss": 1.3857, + "grad_norm": 0.3461027443408966, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493699584, + "loss": 1.3829, + "grad_norm": 0.3472058176994324, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.603526656, + "gpu_mem": 4.4937088, + "loss": 1.3816, + "grad_norm": 0.6470133066177368, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493682688, + "loss": 1.388, + "grad_norm": 0.8258925676345825, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493688832, + "loss": 1.3781, + "grad_norm": 0.46463990211486816, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.603526656, + "gpu_mem": 4.49367808, + "loss": 1.3724, + "grad_norm": 0.896947979927063, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.603526656, + "gpu_mem": 4.493684224, + "loss": 1.3953, + "grad_norm": 0.764835774898529, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.603723264, + "gpu_mem": 4.493721088, + "loss": 1.4093, + "grad_norm": 0.9000777006149292, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493668864, + "loss": 1.4138, + "grad_norm": 0.9584017395973206, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493688832, + "loss": 1.4113, + "grad_norm": 1.2428643703460693, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493688832, + "loss": 1.4259, + "grad_norm": 1.4249953031539917, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493687296, + "loss": 1.3964, + "grad_norm": 0.6787549257278442, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.603919872, + "gpu_mem": 4.49368576, + "loss": 1.3721, + "grad_norm": 0.7358736991882324, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.603919872, + "gpu_mem": 4.49367808, + "loss": 1.3855, + "grad_norm": 0.9281912446022034, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493737984, + "loss": 1.3899, + "grad_norm": 0.48371177911758423, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493682688, + "loss": 1.3837, + "grad_norm": 0.33138900995254517, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493665792, + "loss": 1.3797, + "grad_norm": 0.7497758865356445, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493696512, + "loss": 1.386, + "grad_norm": 0.8916345834732056, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493741056, + "loss": 1.3774, + "grad_norm": 0.8646592497825623, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493721088, + "loss": 1.3845, + "grad_norm": 0.8134586215019226, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493721088, + "loss": 1.4012, + "grad_norm": 0.9545542001724243, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493687296, + "loss": 1.4046, + "grad_norm": 1.4168310165405273, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.603919872, + "gpu_mem": 4.493711872, + "loss": 1.397, + "grad_norm": 0.6675275564193726, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.60411648, + "gpu_mem": 4.493714944, + "loss": 1.3991, + "grad_norm": 0.7612125277519226, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.60411648, + "gpu_mem": 4.49369344, + "loss": 1.3942, + "grad_norm": 1.0415263175964355, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.60411648, + "gpu_mem": 4.49371648, + "loss": 1.3877, + "grad_norm": 1.1107438802719116, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.60411648, + "gpu_mem": 4.493696512, + "loss": 1.3924, + "grad_norm": 0.7298089861869812, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.60411648, + "gpu_mem": 4.493721088, + "loss": 1.3876, + "grad_norm": 0.45008188486099243, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493704192, + "loss": 1.3916, + "grad_norm": 0.34851163625717163, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493699584, + "loss": 1.3821, + "grad_norm": 0.67151939868927, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493711872, + "loss": 1.3813, + "grad_norm": 0.5252869129180908, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493682688, + "loss": 1.3972, + "grad_norm": 0.6368927359580994, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493696512, + "loss": 1.3945, + "grad_norm": 0.5968329310417175, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493682688, + "loss": 1.3956, + "grad_norm": 0.7284405827522278, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493676544, + "loss": 1.3881, + "grad_norm": 0.33032116293907166, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493682688, + "loss": 1.3892, + "grad_norm": 0.2171952724456787, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493696512, + "loss": 1.3892, + "grad_norm": 0.46422359347343445, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493679616, + "loss": 1.3883, + "grad_norm": 0.8638746738433838, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493733376, + "loss": 1.3895, + "grad_norm": 0.45987656712532043, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493676544, + "loss": 1.3869, + "grad_norm": 0.25890523195266724, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.604313088, + "gpu_mem": 4.49375488, + "loss": 1.3818, + "grad_norm": 0.6787480115890503, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493698048, + "loss": 1.3874, + "grad_norm": 1.0950523614883423, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.604313088, + "gpu_mem": 4.49371648, + "loss": 1.3979, + "grad_norm": 0.7617290019989014, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.604313088, + "gpu_mem": 4.493691904, + "loss": 1.3984, + "grad_norm": 0.7897295951843262, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.604509696, + "gpu_mem": 4.49372416, + "loss": 1.3864, + "grad_norm": 0.39449992775917053, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.604509696, + "gpu_mem": 4.493744128, + "loss": 1.3857, + "grad_norm": 0.5921636819839478, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.604509696, + "gpu_mem": 4.493673472, + "loss": 1.3901, + "grad_norm": 0.4348948895931244, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.604509696, + "gpu_mem": 4.493687296, + "loss": 1.3776, + "grad_norm": 0.45093029737472534, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.604509696, + "gpu_mem": 4.493671936, + "loss": 1.3953, + "grad_norm": 0.6254445314407349, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.604509696, + "gpu_mem": 4.493710336, + "loss": 1.3921, + "grad_norm": 0.33055543899536133, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493710336, + "loss": 1.3896, + "grad_norm": 0.579383909702301, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493696512, + "loss": 1.3947, + "grad_norm": 0.486250102519989, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493687296, + "loss": 1.3856, + "grad_norm": 0.2698545753955841, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493691904, + "loss": 1.3904, + "grad_norm": 0.2757352590560913, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493694976, + "loss": 1.3851, + "grad_norm": 0.4521386921405792, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.604706304, + "gpu_mem": 4.49370112, + "loss": 1.3857, + "grad_norm": 0.2825348377227783, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493719552, + "loss": 1.3867, + "grad_norm": 0.21252861618995667, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493713408, + "loss": 1.3911, + "grad_norm": 0.5822197794914246, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493690368, + "loss": 1.3847, + "grad_norm": 0.3752332925796509, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.604706304, + "gpu_mem": 4.49367808, + "loss": 1.3827, + "grad_norm": 0.5531086921691895, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493644288, + "loss": 1.3947, + "grad_norm": 1.0810375213623047, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493691904, + "loss": 1.3852, + "grad_norm": 0.3913367986679077, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493658112, + "loss": 1.3839, + "grad_norm": 0.3635590970516205, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493705728, + "loss": 1.3841, + "grad_norm": 0.6554697751998901, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493704192, + "loss": 1.3848, + "grad_norm": 0.26782703399658203, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493705728, + "loss": 1.3916, + "grad_norm": 0.6363285779953003, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493714944, + "loss": 1.3912, + "grad_norm": 0.3962929844856262, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493690368, + "loss": 1.383, + "grad_norm": 0.5075817108154297, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493675008, + "loss": 1.3882, + "grad_norm": 0.2634309232234955, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493704192, + "loss": 1.3935, + "grad_norm": 0.6562682390213013, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493718016, + "loss": 1.3827, + "grad_norm": 0.5025540590286255, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.604706304, + "gpu_mem": 4.493673472, + "loss": 1.3875, + "grad_norm": 0.8416153788566589, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.604902912, + "gpu_mem": 4.493679616, + "loss": 1.389, + "grad_norm": 0.6714017391204834, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.604902912, + "gpu_mem": 4.4937088, + "loss": 1.3869, + "grad_norm": 0.10729008913040161, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.604902912, + "gpu_mem": 4.493704192, + "loss": 1.3833, + "grad_norm": 0.4481179416179657, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.604902912, + "gpu_mem": 4.493690368, + "loss": 1.391, + "grad_norm": 0.4850403964519501, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.604902912, + "gpu_mem": 4.493704192, + "loss": 1.379, + "grad_norm": 0.7116642594337463, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.60509952, + "gpu_mem": 4.49369344, + "loss": 1.392, + "grad_norm": 0.6265073418617249, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493699584, + "loss": 1.3886, + "grad_norm": 0.25711050629615784, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493704192, + "loss": 1.384, + "grad_norm": 0.3103441596031189, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493699584, + "loss": 1.3902, + "grad_norm": 0.3619419038295746, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493673472, + "loss": 1.3877, + "grad_norm": 0.28750282526016235, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493682688, + "loss": 1.3882, + "grad_norm": 0.6173223257064819, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.60509952, + "gpu_mem": 4.49370112, + "loss": 1.3912, + "grad_norm": 0.6667125821113586, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493671936, + "loss": 1.3878, + "grad_norm": 0.6776307821273804, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493702656, + "loss": 1.3871, + "grad_norm": 0.6379751563072205, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493711872, + "loss": 1.3916, + "grad_norm": 0.5579805374145508, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493673472, + "loss": 1.3885, + "grad_norm": 0.33927759528160095, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.60509952, + "gpu_mem": 4.49367808, + "loss": 1.3864, + "grad_norm": 0.37431737780570984, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493702656, + "loss": 1.3869, + "grad_norm": 0.28183555603027344, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493721088, + "loss": 1.3891, + "grad_norm": 0.2982402741909027, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493702656, + "loss": 1.3849, + "grad_norm": 0.8087599873542786, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493753344, + "loss": 1.3822, + "grad_norm": 0.5288816690444946, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.60509952, + "gpu_mem": 4.49368576, + "loss": 1.3911, + "grad_norm": 0.47356680035591125, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493687296, + "loss": 1.385, + "grad_norm": 0.4644610285758972, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493687296, + "loss": 1.3866, + "grad_norm": 0.6312747001647949, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.60509952, + "gpu_mem": 4.49369344, + "loss": 1.3881, + "grad_norm": 0.3943093419075012, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493707264, + "loss": 1.3839, + "grad_norm": 0.5293429493904114, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493711872, + "loss": 1.388, + "grad_norm": 0.30123311281204224, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493705728, + "loss": 1.3885, + "grad_norm": 0.589518129825592, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493699584, + "loss": 1.3919, + "grad_norm": 0.7247474193572998, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493713408, + "loss": 1.3871, + "grad_norm": 0.9126692414283752, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493705728, + "loss": 1.3867, + "grad_norm": 0.18154481053352356, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493690368, + "loss": 1.3837, + "grad_norm": 0.6712857484817505, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493699584, + "loss": 1.3877, + "grad_norm": 0.23089227080345154, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.60509952, + "gpu_mem": 4.4937088, + "loss": 1.3868, + "grad_norm": 0.3668045997619629, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493711872, + "loss": 1.3916, + "grad_norm": 0.7648842930793762, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493665792, + "loss": 1.3871, + "grad_norm": 0.34318843483924866, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493719552, + "loss": 1.389, + "grad_norm": 0.7701164484024048, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493721088, + "loss": 1.3891, + "grad_norm": 0.4514608681201935, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493665792, + "loss": 1.3874, + "grad_norm": 0.33550116419792175, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.60509952, + "gpu_mem": 4.493699584, + "loss": 1.3853, + "grad_norm": 0.6384766101837158, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.605296128, + "gpu_mem": 4.49367808, + "loss": 1.3892, + "grad_norm": 0.44213274121284485, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.605296128, + "gpu_mem": 4.4937088, + "loss": 1.3863, + "grad_norm": 0.20060130953788757, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493684224, + "loss": 1.3871, + "grad_norm": 0.7004127502441406, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493718016, + "loss": 1.3888, + "grad_norm": 0.5933623313903809, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493737984, + "loss": 1.3873, + "grad_norm": 0.44857585430145264, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493702656, + "loss": 1.3866, + "grad_norm": 0.478500097990036, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493722624, + "loss": 1.385, + "grad_norm": 0.3888069689273834, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493702656, + "loss": 1.3879, + "grad_norm": 0.6951819062232971, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493704192, + "loss": 1.3882, + "grad_norm": 0.7651022672653198, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493696512, + "loss": 1.3879, + "grad_norm": 0.5823536515235901, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493694976, + "loss": 1.3868, + "grad_norm": 0.7235889434814453, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.605296128, + "gpu_mem": 4.4937088, + "loss": 1.3853, + "grad_norm": 0.350984126329422, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493679616, + "loss": 1.3884, + "grad_norm": 0.29162314534187317, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493730304, + "loss": 1.3867, + "grad_norm": 0.9161481857299805, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493727232, + "loss": 1.3878, + "grad_norm": 0.38924887776374817, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493707264, + "loss": 1.3854, + "grad_norm": 0.4280828833580017, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493688832, + "loss": 1.3892, + "grad_norm": 0.5120537281036377, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493698048, + "loss": 1.3868, + "grad_norm": 0.1266588568687439, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493665792, + "loss": 1.3871, + "grad_norm": 0.43774276971817017, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493727232, + "loss": 1.3877, + "grad_norm": 0.37881991267204285, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493725696, + "loss": 1.3863, + "grad_norm": 0.28704485297203064, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493681152, + "loss": 1.3855, + "grad_norm": 0.47494763135910034, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493713408, + "loss": 1.3865, + "grad_norm": 0.3030700385570526, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.605296128, + "gpu_mem": 4.493707264, + "loss": 1.3871, + "grad_norm": 0.6933444142341614, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.605296128, + "gpu_mem": 4.49369344, + "loss": 1.3902, + "grad_norm": 0.7377634644508362, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.605296128, + "gpu_mem": 4.49369344, + "loss": 1.3859, + "grad_norm": 0.5475888252258301, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493719552, + "loss": 1.3897, + "grad_norm": 0.5178420543670654, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493707264, + "loss": 1.388, + "grad_norm": 0.5333667397499084, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493698048, + "loss": 1.3866, + "grad_norm": 0.43449029326438904, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493690368, + "loss": 1.3879, + "grad_norm": 0.39289358258247375, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493713408, + "loss": 1.39, + "grad_norm": 0.7628423571586609, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493702656, + "loss": 1.3889, + "grad_norm": 0.5300387144088745, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493687296, + "loss": 1.3867, + "grad_norm": 0.5272076725959778, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493753344, + "loss": 1.3861, + "grad_norm": 0.6303485631942749, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493696512, + "loss": 1.3879, + "grad_norm": 0.40895283222198486, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493682688, + "loss": 1.3852, + "grad_norm": 0.42513909935951233, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.605492736, + "gpu_mem": 4.4937472, + "loss": 1.3908, + "grad_norm": 0.8127328753471375, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493676544, + "loss": 1.3881, + "grad_norm": 0.6447884440422058, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493704192, + "loss": 1.3888, + "grad_norm": 0.4774654805660248, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493707264, + "loss": 1.3864, + "grad_norm": 0.22659142315387726, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493682688, + "loss": 1.3873, + "grad_norm": 0.6049101948738098, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493710336, + "loss": 1.3874, + "grad_norm": 0.4242684543132782, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493719552, + "loss": 1.3874, + "grad_norm": 0.3358238935470581, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.605492736, + "gpu_mem": 4.49371648, + "loss": 1.3858, + "grad_norm": 0.43234607577323914, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493688832, + "loss": 1.3898, + "grad_norm": 0.6846294403076172, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493435392, + "loss": 1.3861, + "grad_norm": 0.4091266691684723, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.605492736, + "gpu_mem": 4.493435392, + "train_runtime": 8037.0879, + "train_samples_per_second": 4.965, + "train_steps_per_second": 0.078, + "total_flos": 0.0, + "train_loss": 1.4179506748914719 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..91562a2718627f56cb3f88093dd26c3a98c35384 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..9b98a42d44a0680fa244002a81c0cad372d6496e --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.28465193141912826 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..af69066a2d014660d2cddd5f32d9f66ec57e2892 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1577576 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-logiqa-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2", + "seed": 42, + "timestamp": "2025-08-30T13:31:32.519987" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..922af470cb21613cda46834f932a8eb6aa6899e3 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.542684672, + "gpu_mem": 4.424188416, + "loss": 3.8396, + "grad_norm": 307.03704833984375, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436981248, + "loss": 3.9728, + "grad_norm": 306.5046081542969, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437058048, + "loss": 3.4583, + "grad_norm": 265.2538146972656, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436956672, + "loss": 2.7487, + "grad_norm": 198.7176055908203, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436972032, + "loss": 2.3141, + "grad_norm": 109.92964935302734, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436964352, + "loss": 1.9647, + "grad_norm": 63.298133850097656, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437035008, + "loss": 1.6245, + "grad_norm": 38.076515197753906, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437019648, + "loss": 1.5282, + "grad_norm": 21.16168212890625, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437018112, + "loss": 1.4958, + "grad_norm": 18.895023345947266, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437028864, + "loss": 1.4104, + "grad_norm": 12.974337577819824, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436932096, + "loss": 1.3937, + "grad_norm": 10.097373962402344, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436982784, + "loss": 1.4187, + "grad_norm": 13.906693458557129, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437074944, + "loss": 1.3892, + "grad_norm": 6.541962146759033, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436987392, + "loss": 1.4741, + "grad_norm": 13.960658073425293, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437125632, + "loss": 1.4307, + "grad_norm": 9.444807052612305, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436985856, + "loss": 1.4575, + "grad_norm": 13.252518653869629, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437016576, + "loss": 1.3687, + "grad_norm": 6.98301362991333, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436979712, + "loss": 1.3855, + "grad_norm": 10.729589462280273, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436887552, + "loss": 1.3841, + "grad_norm": 5.174127578735352, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436925952, + "loss": 1.3777, + "grad_norm": 4.57600212097168, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437059584, + "loss": 1.4452, + "grad_norm": 12.757831573486328, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436958208, + "loss": 1.4406, + "grad_norm": 9.265005111694336, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436985856, + "loss": 1.4484, + "grad_norm": 12.506772994995117, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436979712, + "loss": 1.3948, + "grad_norm": 6.763986110687256, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436985856, + "loss": 1.4521, + "grad_norm": 10.200590133666992, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43703808, + "loss": 1.4408, + "grad_norm": 12.931480407714844, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436979712, + "loss": 1.3922, + "grad_norm": 6.691672325134277, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436925952, + "loss": 1.406, + "grad_norm": 4.177423000335693, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437018112, + "loss": 1.3896, + "grad_norm": 3.882187843322754, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437013504, + "loss": 1.3254, + "grad_norm": 6.207408428192139, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436992, + "loss": 1.3894, + "grad_norm": 7.2342119216918945, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436996608, + "loss": 1.5835, + "grad_norm": 16.798126220703125, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437031936, + "loss": 1.3806, + "grad_norm": 7.74294900894165, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436956672, + "loss": 1.4736, + "grad_norm": 11.493640899658203, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437004288, + "loss": 1.3772, + "grad_norm": 7.786534309387207, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437019648, + "loss": 1.431, + "grad_norm": 8.842007637023926, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437085696, + "loss": 1.63, + "grad_norm": 19.143287658691406, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43696896, + "loss": 1.437, + "grad_norm": 7.777129173278809, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437085696, + "loss": 1.4284, + "grad_norm": 3.858962059020996, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43700736, + "loss": 1.3717, + "grad_norm": 3.465468406677246, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436905984, + "loss": 1.415, + "grad_norm": 6.483587741851807, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436978176, + "loss": 1.5583, + "grad_norm": 12.813518524169922, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436941312, + "loss": 1.5052, + "grad_norm": 11.528299331665039, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43697664, + "loss": 1.3962, + "grad_norm": 4.32180118560791, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4370304, + "loss": 1.3845, + "grad_norm": 2.8027145862579346, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437078016, + "loss": 1.3837, + "grad_norm": 3.8305411338806152, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436905984, + "loss": 1.4121, + "grad_norm": 5.516960620880127, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436959744, + "loss": 1.4267, + "grad_norm": 5.097311973571777, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436948992, + "loss": 1.4197, + "grad_norm": 3.493868827819824, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436958208, + "loss": 1.3759, + "grad_norm": 2.3809967041015625, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43705344, + "loss": 1.375, + "grad_norm": 3.1047401428222656, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436995072, + "loss": 1.5038, + "grad_norm": 9.552563667297363, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437073408, + "loss": 1.4852, + "grad_norm": 9.313773155212402, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436973568, + "loss": 1.402, + "grad_norm": 3.2892346382141113, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436970496, + "loss": 1.4008, + "grad_norm": 4.480459690093994, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43701504, + "loss": 1.393, + "grad_norm": 3.0579943656921387, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43700736, + "loss": 1.4071, + "grad_norm": 4.939620494842529, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4370304, + "loss": 1.4622, + "grad_norm": 7.6857991218566895, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436998144, + "loss": 1.3538, + "grad_norm": 3.4843547344207764, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436988928, + "loss": 1.4282, + "grad_norm": 4.150071144104004, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437025792, + "loss": 1.3642, + "grad_norm": 3.105482339859009, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437065728, + "loss": 1.3724, + "grad_norm": 1.8189406394958496, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436995072, + "loss": 1.3913, + "grad_norm": 4.127066135406494, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436896768, + "loss": 1.4501, + "grad_norm": 8.422019958496094, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436978176, + "loss": 1.4944, + "grad_norm": 8.256820678710938, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437082624, + "loss": 1.4069, + "grad_norm": 2.098107099533081, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436959744, + "loss": 1.4095, + "grad_norm": 5.316582679748535, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437011968, + "loss": 1.4449, + "grad_norm": 10.68295669555664, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437004288, + "loss": 1.417, + "grad_norm": 9.017110824584961, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436919808, + "loss": 1.4305, + "grad_norm": 10.536531448364258, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436942848, + "loss": 1.3802, + "grad_norm": 7.772149085998535, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436992, + "loss": 1.3963, + "grad_norm": 6.475351810455322, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43694592, + "loss": 1.4226, + "grad_norm": 6.625594615936279, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43699968, + "loss": 1.4237, + "grad_norm": 6.603949546813965, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437042688, + "loss": 1.444, + "grad_norm": 6.53723669052124, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436990464, + "loss": 1.4201, + "grad_norm": 4.724778175354004, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436929024, + "loss": 1.3956, + "grad_norm": 3.769388198852539, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4370304, + "loss": 1.413, + "grad_norm": 11.433056831359863, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437047296, + "loss": 1.2629, + "grad_norm": 7.541171550750732, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436887552, + "loss": 1.6143, + "grad_norm": 21.642969131469727, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436993536, + "loss": 1.6058, + "grad_norm": 22.657106399536133, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436965888, + "loss": 1.478, + "grad_norm": 12.65410041809082, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436979712, + "loss": 1.4911, + "grad_norm": 14.12283706665039, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436985856, + "loss": 1.4465, + "grad_norm": 7.715137958526611, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43701504, + "loss": 1.4103, + "grad_norm": 4.2480597496032715, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436956672, + "loss": 1.3704, + "grad_norm": 2.4768495559692383, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436978176, + "loss": 1.4081, + "grad_norm": 4.221753120422363, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437004288, + "loss": 1.3728, + "grad_norm": 5.797238349914551, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436952064, + "loss": 1.4942, + "grad_norm": 61.849849700927734, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43698432, + "loss": 1.3563, + "grad_norm": 5.139640808105469, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43697664, + "loss": 1.3427, + "grad_norm": 3.5426790714263916, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436916736, + "loss": 1.4059, + "grad_norm": 7.412167072296143, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437117952, + "loss": 1.4047, + "grad_norm": 4.4727067947387695, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436932096, + "loss": 1.4542, + "grad_norm": 4.2654194831848145, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43709952, + "loss": 1.4145, + "grad_norm": 3.0380778312683105, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436973568, + "loss": 1.3807, + "grad_norm": 4.473355770111084, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43696896, + "loss": 1.3985, + "grad_norm": 4.709869384765625, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437016576, + "loss": 1.4624, + "grad_norm": 12.270997047424316, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436998144, + "loss": 1.4121, + "grad_norm": 7.835821628570557, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436962816, + "loss": 1.3836, + "grad_norm": 3.682302474975586, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43694592, + "loss": 1.4289, + "grad_norm": 3.322937488555908, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436996608, + "loss": 1.3995, + "grad_norm": 3.6305789947509766, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436956672, + "loss": 1.3954, + "grad_norm": 4.752967834472656, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43704576, + "loss": 1.4046, + "grad_norm": 5.5753703117370605, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4369152, + "loss": 1.4065, + "grad_norm": 14.203337669372559, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436978176, + "loss": 1.388, + "grad_norm": 2.1711857318878174, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436973568, + "loss": 1.4044, + "grad_norm": 5.651125431060791, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436962816, + "loss": 1.3658, + "grad_norm": 2.760498046875, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437131776, + "loss": 1.3918, + "grad_norm": 4.558818817138672, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436929024, + "loss": 1.3286, + "grad_norm": 12.547309875488281, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43693056, + "loss": 3.8516, + "grad_norm": 394.89556884765625, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4370304, + "loss": 4.4351, + "grad_norm": 1389.341552734375, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437128704, + "loss": 1.7522, + "grad_norm": 301.48419189453125, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436958208, + "loss": 1.4492, + "grad_norm": 26.6928653717041, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436967424, + "loss": 1.5123, + "grad_norm": 19.180158615112305, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437028864, + "loss": 1.4155, + "grad_norm": 12.314019203186035, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436921344, + "loss": 1.4016, + "grad_norm": 5.045202255249023, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437016576, + "loss": 1.4009, + "grad_norm": 9.185734748840332, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437150208, + "loss": 1.4332, + "grad_norm": 8.547242164611816, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43705344, + "loss": 1.3961, + "grad_norm": 8.1763916015625, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43709952, + "loss": 1.406, + "grad_norm": 5.207728862762451, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437044224, + "loss": 1.4144, + "grad_norm": 4.546237468719482, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437082624, + "loss": 1.4938, + "grad_norm": 11.621025085449219, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437005824, + "loss": 1.3624, + "grad_norm": 4.2380805015563965, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437041152, + "loss": 1.4218, + "grad_norm": 5.953054428100586, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436947456, + "loss": 1.3943, + "grad_norm": 5.93511962890625, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436972032, + "loss": 1.4356, + "grad_norm": 6.253291606903076, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436944384, + "loss": 1.38, + "grad_norm": 4.03702974319458, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436950528, + "loss": 1.3725, + "grad_norm": 2.619130849838257, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436941312, + "loss": 1.4108, + "grad_norm": 3.50997257232666, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436875264, + "loss": 1.4081, + "grad_norm": 4.510496616363525, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43703808, + "loss": 1.3792, + "grad_norm": 0.8556973934173584, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436950528, + "loss": 1.3479, + "grad_norm": 3.3924639225006104, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4369536, + "loss": 1.4474, + "grad_norm": 9.347857475280762, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437008896, + "loss": 1.4261, + "grad_norm": 9.368775367736816, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436912128, + "loss": 1.4915, + "grad_norm": 13.565790176391602, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437079552, + "loss": 1.414, + "grad_norm": 5.20265007019043, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43694592, + "loss": 1.3964, + "grad_norm": 9.744085311889648, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436935168, + "loss": 1.3945, + "grad_norm": 6.888545989990234, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436987392, + "loss": 1.3812, + "grad_norm": 7.004289150238037, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436912128, + "loss": 1.4493, + "grad_norm": 11.87932300567627, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436973568, + "loss": 1.4787, + "grad_norm": 9.332930564880371, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436972032, + "loss": 1.408, + "grad_norm": 10.163644790649414, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436916736, + "loss": 1.3805, + "grad_norm": 3.221417188644409, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436913664, + "loss": 1.44, + "grad_norm": 6.082846164703369, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43701504, + "loss": 1.4174, + "grad_norm": 3.4450299739837646, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437019648, + "loss": 1.3983, + "grad_norm": 2.7770490646362305, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437117952, + "loss": 1.3999, + "grad_norm": 3.985198497772217, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43696128, + "loss": 1.4045, + "grad_norm": 3.4227142333984375, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4369536, + "loss": 1.3966, + "grad_norm": 3.720924139022827, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436896768, + "loss": 1.3908, + "grad_norm": 4.302285671234131, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436929024, + "loss": 1.394, + "grad_norm": 2.2109181880950928, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436956672, + "loss": 1.4095, + "grad_norm": 3.273369312286377, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43700736, + "loss": 1.3832, + "grad_norm": 2.413292646408081, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436919808, + "loss": 1.4042, + "grad_norm": 1.885083794593811, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436939776, + "loss": 1.4344, + "grad_norm": 4.110490798950195, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437016576, + "loss": 1.3783, + "grad_norm": 2.614349603652954, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436918272, + "loss": 1.4003, + "grad_norm": 2.0815186500549316, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437019648, + "loss": 1.4062, + "grad_norm": 2.4118287563323975, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43697664, + "loss": 1.3706, + "grad_norm": 2.7744522094726562, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436975104, + "loss": 1.3894, + "grad_norm": 2.8940885066986084, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436958208, + "loss": 1.4029, + "grad_norm": 2.959047794342041, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437074944, + "loss": 1.4615, + "grad_norm": 6.1964569091796875, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436970496, + "loss": 1.4198, + "grad_norm": 5.1621270179748535, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437050368, + "loss": 1.3993, + "grad_norm": 3.2406246662139893, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436959744, + "loss": 1.439, + "grad_norm": 9.39356517791748, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436921344, + "loss": 1.3935, + "grad_norm": 5.068621635437012, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436924416, + "loss": 1.354, + "grad_norm": 2.036619186401367, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436992, + "loss": 1.3768, + "grad_norm": 3.3214974403381348, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43699968, + "loss": 1.431, + "grad_norm": 3.966883420944214, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436973568, + "loss": 1.4052, + "grad_norm": 2.4346113204956055, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4369536, + "loss": 1.4041, + "grad_norm": 1.9648479223251343, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436990464, + "loss": 1.3737, + "grad_norm": 1.419352650642395, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436933632, + "loss": 1.384, + "grad_norm": 0.8905038833618164, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437004288, + "loss": 1.4043, + "grad_norm": 4.2183709144592285, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436901376, + "loss": 1.3715, + "grad_norm": 1.1273407936096191, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436910592, + "loss": 1.451, + "grad_norm": 7.368158340454102, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436987392, + "loss": 1.4232, + "grad_norm": 5.5431976318359375, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436995072, + "loss": 1.3526, + "grad_norm": 1.6538937091827393, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437001216, + "loss": 1.4452, + "grad_norm": 5.8866190910339355, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436919808, + "loss": 1.4068, + "grad_norm": 3.306889772415161, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4370304, + "loss": 1.4073, + "grad_norm": 2.801975727081299, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437018112, + "loss": 1.3847, + "grad_norm": 1.62848961353302, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437001216, + "loss": 1.4105, + "grad_norm": 4.056824684143066, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436939776, + "loss": 1.3659, + "grad_norm": 3.5004308223724365, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436972032, + "loss": 1.3845, + "grad_norm": 2.743223190307617, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4370688, + "loss": 1.3898, + "grad_norm": 2.917649030685425, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437041152, + "loss": 1.3585, + "grad_norm": 0.9300052523612976, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43694592, + "loss": 1.4934, + "grad_norm": 6.275365352630615, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436978176, + "loss": 1.442, + "grad_norm": 7.51246452331543, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436967424, + "loss": 1.3983, + "grad_norm": 4.858281135559082, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436904448, + "loss": 1.4112, + "grad_norm": 10.26813793182373, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437025792, + "loss": 1.3974, + "grad_norm": 3.9834237098693848, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436947456, + "loss": 1.3893, + "grad_norm": 6.387353897094727, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437059584, + "loss": 1.3815, + "grad_norm": 3.582815170288086, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437054976, + "loss": 1.384, + "grad_norm": 4.741793155670166, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 2.1187, + "grad_norm": 7.876739501953125, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443441664, + "loss": 1.38, + "grad_norm": 2.773564577102661, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443438592, + "loss": 1.3898, + "grad_norm": 5.180283069610596, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443323392, + "loss": 1.3596, + "grad_norm": 3.430567502975464, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44337408, + "loss": 1.4409, + "grad_norm": 10.016457557678223, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443367936, + "loss": 1.4078, + "grad_norm": 5.990716934204102, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443415552, + "loss": 1.498, + "grad_norm": 14.323673248291016, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443347968, + "loss": 1.4367, + "grad_norm": 9.776204109191895, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443394048, + "loss": 1.3876, + "grad_norm": 3.706735610961914, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443407872, + "loss": 1.3707, + "grad_norm": 3.7016794681549072, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443363328, + "loss": 1.4105, + "grad_norm": 8.072277069091797, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443355648, + "loss": 1.4655, + "grad_norm": 9.413515090942383, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443476992, + "loss": 1.4111, + "grad_norm": 6.987301826477051, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44337408, + "loss": 1.3535, + "grad_norm": 6.347769260406494, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 1.4171, + "grad_norm": 11.720592498779297, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443378688, + "loss": 1.3759, + "grad_norm": 2.899240493774414, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44330496, + "loss": 1.4137, + "grad_norm": 4.680694580078125, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443300352, + "loss": 1.3831, + "grad_norm": 1.9631277322769165, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44335104, + "loss": 1.3623, + "grad_norm": 4.5422210693359375, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443480064, + "loss": 1.4839, + "grad_norm": 7.080499649047852, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443341824, + "loss": 1.4321, + "grad_norm": 5.4563398361206055, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443367936, + "loss": 1.3921, + "grad_norm": 2.6275148391723633, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443380224, + "loss": 1.3852, + "grad_norm": 1.3167184591293335, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443352576, + "loss": 1.398, + "grad_norm": 3.2626845836639404, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443326464, + "loss": 1.3968, + "grad_norm": 3.6421191692352295, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443338752, + "loss": 1.3894, + "grad_norm": 4.639440536499023, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4433664, + "loss": 3.4112, + "grad_norm": 2004.84814453125, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443384832, + "loss": 1.3876, + "grad_norm": 5.099781513214111, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44343552, + "loss": 1.3743, + "grad_norm": 1.265486717224121, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443386368, + "loss": 1.3904, + "grad_norm": 2.865905523300171, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443337216, + "loss": 1.3774, + "grad_norm": 2.6712446212768555, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44338176, + "loss": 1.485, + "grad_norm": 40.328487396240234, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443280384, + "loss": 1.3756, + "grad_norm": 3.6664040088653564, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443347968, + "loss": 1.3983, + "grad_norm": 5.039750099182129, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443338752, + "loss": 1.3951, + "grad_norm": 8.265695571899414, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44334336, + "loss": 1.4525, + "grad_norm": 22.711240768432617, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44345856, + "loss": 1.4034, + "grad_norm": 9.172844886779785, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443418624, + "loss": 1.4043, + "grad_norm": 7.119886875152588, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443377152, + "loss": 2.987, + "grad_norm": 1576.65380859375, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 1.4174, + "grad_norm": 9.58725357055664, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443400192, + "loss": 1.3976, + "grad_norm": 3.9680752754211426, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44342016, + "loss": 1.3909, + "grad_norm": 3.7593157291412354, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443337216, + "loss": 1.4075, + "grad_norm": 8.69713306427002, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443472384, + "loss": 1.3825, + "grad_norm": 4.425906658172607, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443426304, + "loss": 1.4256, + "grad_norm": 6.402980804443359, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443447808, + "loss": 1.4145, + "grad_norm": 7.375691890716553, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443334144, + "loss": 1.3896, + "grad_norm": 9.25141716003418, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443321856, + "loss": 1.4189, + "grad_norm": 5.865243911743164, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443326464, + "loss": 1.4063, + "grad_norm": 4.288112163543701, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443363328, + "loss": 1.3881, + "grad_norm": 1.9679096937179565, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443324928, + "loss": 1.443, + "grad_norm": 8.807768821716309, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443371008, + "loss": 1.3924, + "grad_norm": 1.8757638931274414, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443380224, + "loss": 1.3904, + "grad_norm": 2.0265486240386963, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443418624, + "loss": 1.4045, + "grad_norm": 4.497933864593506, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443337216, + "loss": 1.3915, + "grad_norm": 3.89949369430542, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443346432, + "loss": 1.3714, + "grad_norm": 5.125971794128418, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44341248, + "loss": 1.387, + "grad_norm": 1.11368989944458, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443338752, + "loss": 1.3793, + "grad_norm": 2.6628825664520264, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443332608, + "loss": 1.4312, + "grad_norm": 3.8492119312286377, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443398656, + "loss": 1.4252, + "grad_norm": 4.406234264373779, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443357184, + "loss": 1.3734, + "grad_norm": 1.0730401277542114, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443470848, + "loss": 1.3963, + "grad_norm": 4.121011734008789, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443347968, + "loss": 1.426, + "grad_norm": 4.8143134117126465, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44337408, + "loss": 1.3926, + "grad_norm": 2.3762896060943604, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443380224, + "loss": 1.4021, + "grad_norm": 6.552372455596924, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443447808, + "loss": 1.5125, + "grad_norm": 19.26609230041504, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443303424, + "loss": 1.5421, + "grad_norm": 23.97142219543457, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443437056, + "loss": 1.4514, + "grad_norm": 12.813199996948242, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44346624, + "loss": 1.3628, + "grad_norm": 5.174118995666504, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44333568, + "loss": 1.4145, + "grad_norm": 6.571137428283691, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443337216, + "loss": 1.38, + "grad_norm": 3.6098642349243164, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443338752, + "loss": 1.39, + "grad_norm": 3.104609727859497, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44341248, + "loss": 1.4317, + "grad_norm": 5.867928504943848, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443394048, + "loss": 1.3842, + "grad_norm": 2.517528772354126, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443337216, + "loss": 1.3939, + "grad_norm": 2.25945782661438, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 1.3968, + "grad_norm": 4.548844337463379, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4434432, + "loss": 1.4, + "grad_norm": 2.6111390590667725, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443395584, + "loss": 1.3828, + "grad_norm": 2.363496780395508, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443523072, + "loss": 1.3813, + "grad_norm": 3.586388349533081, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443407872, + "loss": 1.3727, + "grad_norm": 1.7741920948028564, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443357184, + "loss": 1.3753, + "grad_norm": 2.334848642349243, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443344896, + "loss": 1.3704, + "grad_norm": 2.2509167194366455, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443337216, + "loss": 1.3955, + "grad_norm": 2.0306811332702637, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443460096, + "loss": 1.4229, + "grad_norm": 3.161910057067871, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443400192, + "loss": 1.4441, + "grad_norm": 5.82344388961792, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443346432, + "loss": 1.4474, + "grad_norm": 4.600713729858398, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443363328, + "loss": 1.4167, + "grad_norm": 3.9772515296936035, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443503104, + "loss": 1.3865, + "grad_norm": 1.304153561592102, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44337408, + "loss": 1.3982, + "grad_norm": 3.43221378326416, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443369472, + "loss": 1.4302, + "grad_norm": 4.505683422088623, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443272704, + "loss": 1.3963, + "grad_norm": 3.0907890796661377, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443347968, + "loss": 1.3963, + "grad_norm": 2.765439033508301, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443367936, + "loss": 1.3808, + "grad_norm": 1.4289566278457642, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443334144, + "loss": 1.376, + "grad_norm": 0.5951865315437317, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443295744, + "loss": 1.4139, + "grad_norm": 1.9721232652664185, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443421696, + "loss": 1.41, + "grad_norm": 1.8678447008132935, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443298816, + "loss": 1.4121, + "grad_norm": 1.8233745098114014, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443341824, + "loss": 1.3851, + "grad_norm": 1.1304267644882202, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443324928, + "loss": 1.3947, + "grad_norm": 2.5517282485961914, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4433664, + "loss": 1.4085, + "grad_norm": 1.9250513315200806, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443306496, + "loss": 1.3955, + "grad_norm": 1.8314943313598633, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443329536, + "loss": 1.391, + "grad_norm": 2.989061117172241, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443318784, + "loss": 1.4101, + "grad_norm": 3.663693428039551, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443331072, + "loss": 1.4107, + "grad_norm": 3.9488275051116943, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443334144, + "loss": 1.3461, + "grad_norm": 5.22908878326416, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443315712, + "loss": 1.4044, + "grad_norm": 2.8792381286621094, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443321856, + "loss": 1.429, + "grad_norm": 3.9767704010009766, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443269632, + "loss": 1.4707, + "grad_norm": 5.497215747833252, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443341824, + "loss": 1.3737, + "grad_norm": 1.5776493549346924, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443383296, + "loss": 1.3755, + "grad_norm": 1.9662401676177979, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443331072, + "loss": 1.3762, + "grad_norm": 1.1532561779022217, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443341824, + "loss": 1.4028, + "grad_norm": 1.5066567659378052, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443324928, + "loss": 1.4294, + "grad_norm": 2.685598134994507, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443380224, + "loss": 1.3918, + "grad_norm": 1.040982723236084, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443346432, + "loss": 1.3873, + "grad_norm": 0.9213346242904663, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443355648, + "loss": 1.3971, + "grad_norm": 1.1168651580810547, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443357184, + "loss": 1.3876, + "grad_norm": 2.4175913333892822, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443584512, + "loss": 1.3931, + "grad_norm": 1.4979102611541748, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443377152, + "loss": 1.3976, + "grad_norm": 1.8535171747207642, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443349504, + "loss": 1.4023, + "grad_norm": 2.0635488033294678, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443314176, + "loss": 1.3961, + "grad_norm": 2.61737322807312, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4434048, + "loss": 1.3899, + "grad_norm": 1.9784200191497803, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443329536, + "loss": 1.3821, + "grad_norm": 2.6784799098968506, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443377152, + "loss": 1.3753, + "grad_norm": 1.4658716917037964, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443318784, + "loss": 1.3757, + "grad_norm": 1.9525872468948364, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443539968, + "loss": 1.4204, + "grad_norm": 2.7995970249176025, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443421696, + "loss": 1.3971, + "grad_norm": 2.083730697631836, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443314176, + "loss": 1.3816, + "grad_norm": 2.94307279586792, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443354112, + "loss": 1.3835, + "grad_norm": 1.8263893127441406, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443401728, + "loss": 1.3872, + "grad_norm": 1.6781882047653198, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443367936, + "loss": 1.4056, + "grad_norm": 1.9546908140182495, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443423232, + "loss": 1.4017, + "grad_norm": 1.611274242401123, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443315712, + "loss": 1.3734, + "grad_norm": 0.9216758608818054, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443400192, + "loss": 1.3743, + "grad_norm": 2.215052604675293, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44338944, + "loss": 1.3487, + "grad_norm": 1.7355232238769531, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443344896, + "loss": 1.4052, + "grad_norm": 2.2989501953125, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443433984, + "loss": 1.4199, + "grad_norm": 2.1605899333953857, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443361792, + "loss": 1.3897, + "grad_norm": 1.3938462734222412, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443371008, + "loss": 1.385, + "grad_norm": 1.4314461946487427, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443423232, + "loss": 1.3748, + "grad_norm": 1.548171877861023, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443423232, + "loss": 1.3958, + "grad_norm": 3.1536169052124023, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443460096, + "loss": 1.385, + "grad_norm": 2.660367012023926, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443377152, + "loss": 1.3898, + "grad_norm": 1.7037527561187744, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443446272, + "loss": 1.3885, + "grad_norm": 1.338598370552063, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443387904, + "loss": 1.3903, + "grad_norm": 1.1447930335998535, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443394048, + "loss": 1.3718, + "grad_norm": 3.0245883464813232, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44330496, + "loss": 1.4092, + "grad_norm": 2.4077954292297363, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443423232, + "loss": 1.4, + "grad_norm": 1.6438277959823608, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 1.3871, + "grad_norm": 2.3109991550445557, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443321856, + "loss": 1.3803, + "grad_norm": 2.3710386753082275, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443371008, + "loss": 1.3887, + "grad_norm": 2.0217654705047607, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44349696, + "loss": 1.3799, + "grad_norm": 2.3703670501708984, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443295744, + "loss": 1.3837, + "grad_norm": 1.4704362154006958, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443360256, + "loss": 1.3954, + "grad_norm": 1.5949901342391968, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 1.3799, + "grad_norm": 1.5547409057617188, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4433664, + "loss": 1.3954, + "grad_norm": 1.9825468063354492, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443363328, + "loss": 1.3826, + "grad_norm": 3.001039743423462, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443338752, + "loss": 1.4034, + "grad_norm": 2.2517752647399902, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443321856, + "loss": 1.3736, + "grad_norm": 1.4256705045700073, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443323392, + "loss": 1.3747, + "grad_norm": 1.1332354545593262, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 1.4127, + "grad_norm": 3.387650728225708, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443441664, + "loss": 1.3965, + "grad_norm": 4.969893455505371, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443395584, + "loss": 1.3923, + "grad_norm": 2.535094738006592, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443446272, + "loss": 1.3944, + "grad_norm": 1.4764195680618286, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443417088, + "loss": 1.3947, + "grad_norm": 2.4537291526794434, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44335104, + "loss": 1.3872, + "grad_norm": 6.119813442230225, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443308032, + "loss": 1.3635, + "grad_norm": 5.0741400718688965, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44333568, + "loss": 1.4775, + "grad_norm": 28.188060760498047, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443318784, + "loss": 1.366, + "grad_norm": 11.424386024475098, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443361792, + "loss": 1.3996, + "grad_norm": 9.051538467407227, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443415552, + "loss": 1.4008, + "grad_norm": 5.713983058929443, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44331264, + "loss": 1.4063, + "grad_norm": 6.821964740753174, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443372544, + "loss": 1.3941, + "grad_norm": 3.3910341262817383, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443295744, + "loss": 1.3789, + "grad_norm": 2.7156152725219727, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443455488, + "loss": 1.3857, + "grad_norm": 2.516284942626953, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443309568, + "loss": 1.4018, + "grad_norm": 3.8482778072357178, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443332608, + "loss": 1.3703, + "grad_norm": 1.461777925491333, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443355648, + "loss": 1.4488, + "grad_norm": 4.555549621582031, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443579904, + "loss": 1.4198, + "grad_norm": 2.883892059326172, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443337216, + "loss": 1.3711, + "grad_norm": 2.872962713241577, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443460096, + "loss": 1.4339, + "grad_norm": 3.861992597579956, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443375616, + "loss": 1.406, + "grad_norm": 5.7337446212768555, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443464704, + "loss": 1.4129, + "grad_norm": 5.8016180992126465, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443363328, + "loss": 1.4046, + "grad_norm": 4.355131149291992, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443371008, + "loss": 1.3926, + "grad_norm": 1.9057713747024536, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443294208, + "loss": 1.3753, + "grad_norm": 3.6798036098480225, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44337408, + "loss": 1.4138, + "grad_norm": 4.293233871459961, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443344896, + "loss": 1.4206, + "grad_norm": 3.4451849460601807, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443332608, + "loss": 1.3841, + "grad_norm": 1.489809274673462, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443475456, + "loss": 1.3473, + "grad_norm": 2.19252872467041, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.547993088, + "gpu_mem": 4.44335872, + "loss": 1.373, + "grad_norm": 0.9671472907066345, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443369472, + "loss": 1.4112, + "grad_norm": 2.1894729137420654, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443524608, + "loss": 1.4008, + "grad_norm": 2.1241235733032227, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443334144, + "loss": 1.3874, + "grad_norm": 1.6909992694854736, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443355648, + "loss": 1.3897, + "grad_norm": 1.4312812089920044, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.547993088, + "gpu_mem": 4.443392512, + "loss": 1.3914, + "grad_norm": 1.4874215126037598, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.547993088, + "gpu_mem": 4.442979328, + "loss": 2.069, + "grad_norm": 1.5979630947113037, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436927488, + "loss": 1.3739, + "grad_norm": 2.147052049636841, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436905984, + "loss": 1.3725, + "grad_norm": 1.3177812099456787, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436921344, + "loss": 1.4009, + "grad_norm": 1.858148217201233, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.547993088, + "gpu_mem": 4.4370688, + "loss": 1.365, + "grad_norm": 1.1124112606048584, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43701504, + "loss": 1.3986, + "grad_norm": 1.3658708333969116, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437031936, + "loss": 1.367, + "grad_norm": 1.1217901706695557, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43698432, + "loss": 1.3926, + "grad_norm": 1.2156225442886353, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437008896, + "loss": 1.3906, + "grad_norm": 1.6388152837753296, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437056512, + "loss": 1.3883, + "grad_norm": 1.4425628185272217, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436965888, + "loss": 1.3699, + "grad_norm": 1.6139097213745117, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43693824, + "loss": 1.3671, + "grad_norm": 1.541282296180725, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437027328, + "loss": 1.3792, + "grad_norm": 0.9301919341087341, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436918272, + "loss": 1.3762, + "grad_norm": 1.4375085830688477, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436970496, + "loss": 1.3984, + "grad_norm": 1.294041633605957, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436924416, + "loss": 1.3707, + "grad_norm": 1.5805730819702148, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437127168, + "loss": 1.3692, + "grad_norm": 2.0819528102874756, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437125632, + "loss": 1.4152, + "grad_norm": 2.5771429538726807, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437048832, + "loss": 1.3887, + "grad_norm": 1.4272663593292236, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437005824, + "loss": 1.3704, + "grad_norm": 1.949789047241211, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436982784, + "loss": 1.3808, + "grad_norm": 1.9287328720092773, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436936704, + "loss": 1.3711, + "grad_norm": 1.1953377723693848, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43705344, + "loss": 1.3455, + "grad_norm": 2.7569072246551514, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436941312, + "loss": 1.398, + "grad_norm": 1.974427580833435, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43693056, + "loss": 1.4037, + "grad_norm": 1.9963749647140503, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436935168, + "loss": 1.3876, + "grad_norm": 1.794506311416626, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437050368, + "loss": 1.4525, + "grad_norm": 3.771240234375, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437024256, + "loss": 1.4072, + "grad_norm": 2.5800931453704834, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436925952, + "loss": 1.3774, + "grad_norm": 2.248476266860962, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43696128, + "loss": 1.3733, + "grad_norm": 1.6509946584701538, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436927488, + "loss": 1.3881, + "grad_norm": 1.5690736770629883, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43706112, + "loss": 1.3778, + "grad_norm": 1.1217279434204102, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437035008, + "loss": 1.3913, + "grad_norm": 2.758509635925293, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437079552, + "loss": 1.341, + "grad_norm": 1.969782829284668, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436970496, + "loss": 1.3381, + "grad_norm": 1.841902732849121, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436987392, + "loss": 1.3406, + "grad_norm": 3.5378804206848145, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437062656, + "loss": 1.4443, + "grad_norm": 4.393992900848389, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436910592, + "loss": 1.4169, + "grad_norm": 4.095976829528809, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437028864, + "loss": 1.4488, + "grad_norm": 4.918739318847656, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.547993088, + "gpu_mem": 4.437018112, + "loss": 1.4525, + "grad_norm": 4.2065534591674805, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.547993088, + "gpu_mem": 4.43698432, + "loss": 1.3981, + "grad_norm": 2.051485538482666, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436962816, + "loss": 1.357, + "grad_norm": 1.5417219400405884, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.547993088, + "gpu_mem": 4.436958208, + "loss": 1.4126, + "grad_norm": 2.9108402729034424, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43696896, + "loss": 1.3739, + "grad_norm": 2.4440855979919434, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43702272, + "loss": 1.4001, + "grad_norm": 2.3966352939605713, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436939776, + "loss": 1.3858, + "grad_norm": 0.9639256596565247, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436967424, + "loss": 1.397, + "grad_norm": 2.3709142208099365, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436965888, + "loss": 1.3718, + "grad_norm": 1.4743845462799072, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437008896, + "loss": 1.3898, + "grad_norm": 2.0872960090637207, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43700736, + "loss": 1.364, + "grad_norm": 1.9988889694213867, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437058048, + "loss": 1.3857, + "grad_norm": 1.5726934671401978, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437064192, + "loss": 1.3832, + "grad_norm": 1.099429726600647, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437002752, + "loss": 1.3952, + "grad_norm": 1.7239986658096313, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436992, + "loss": 1.4082, + "grad_norm": 2.6275012493133545, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437016576, + "loss": 1.3992, + "grad_norm": 1.9475616216659546, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437033472, + "loss": 1.3754, + "grad_norm": 1.3646483421325684, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436904448, + "loss": 1.389, + "grad_norm": 1.221951961517334, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437079552, + "loss": 1.381, + "grad_norm": 1.6174566745758057, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437036544, + "loss": 1.3795, + "grad_norm": 1.3451648950576782, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43694592, + "loss": 1.3987, + "grad_norm": 2.163792610168457, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436893696, + "loss": 1.3858, + "grad_norm": 1.1163667440414429, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436909056, + "loss": 1.3859, + "grad_norm": 0.9826223254203796, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437010432, + "loss": 1.3753, + "grad_norm": 1.0289199352264404, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437019648, + "loss": 1.3749, + "grad_norm": 2.284090995788574, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437048832, + "loss": 1.3965, + "grad_norm": 2.2552146911621094, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437110272, + "loss": 1.3743, + "grad_norm": 1.2774784564971924, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436964352, + "loss": 1.3658, + "grad_norm": 1.0670140981674194, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437051904, + "loss": 1.3581, + "grad_norm": 2.134063720703125, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.3667, + "grad_norm": 1.7775194644927979, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437025792, + "loss": 1.3918, + "grad_norm": 1.5040647983551025, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43706112, + "loss": 1.3816, + "grad_norm": 2.24969744682312, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43699968, + "loss": 1.3549, + "grad_norm": 1.408273696899414, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436956672, + "loss": 1.3777, + "grad_norm": 1.6811532974243164, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436958208, + "loss": 1.3786, + "grad_norm": 1.2004536390304565, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436995072, + "loss": 1.3495, + "grad_norm": 1.6829664707183838, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436967424, + "loss": 1.3563, + "grad_norm": 1.4313675165176392, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437024256, + "loss": 1.3774, + "grad_norm": 2.0832738876342773, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43705344, + "loss": 1.3917, + "grad_norm": 3.3172905445098877, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.548189696, + "gpu_mem": 4.4369536, + "loss": 1.4212, + "grad_norm": 2.221398115158081, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43697664, + "loss": 1.3962, + "grad_norm": 2.0207877159118652, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436952064, + "loss": 1.4087, + "grad_norm": 2.275268793106079, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437044224, + "loss": 1.3861, + "grad_norm": 1.704047679901123, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436913664, + "loss": 1.3773, + "grad_norm": 1.010913372039795, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437050368, + "loss": 1.376, + "grad_norm": 1.1782402992248535, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43690752, + "loss": 1.3965, + "grad_norm": 1.8571929931640625, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43693056, + "loss": 1.3643, + "grad_norm": 1.2913237810134888, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436978176, + "loss": 1.3786, + "grad_norm": 1.7221027612686157, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437021184, + "loss": 1.377, + "grad_norm": 1.0032213926315308, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436948992, + "loss": 1.3888, + "grad_norm": 1.9584612846374512, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437047296, + "loss": 1.3817, + "grad_norm": 1.7268660068511963, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436942848, + "loss": 1.3771, + "grad_norm": 1.4355885982513428, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437097984, + "loss": 1.3823, + "grad_norm": 1.3262178897857666, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43693056, + "loss": 1.3702, + "grad_norm": 1.5874065160751343, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436973568, + "loss": 1.4007, + "grad_norm": 1.6258431673049927, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436995072, + "loss": 1.3776, + "grad_norm": 1.6264384984970093, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436918272, + "loss": 1.3797, + "grad_norm": 1.282909870147705, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437054976, + "loss": 1.3811, + "grad_norm": 2.164472818374634, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436942848, + "loss": 1.3592, + "grad_norm": 1.4914262294769287, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436939776, + "loss": 1.3839, + "grad_norm": 1.2959678173065186, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436913664, + "loss": 1.3739, + "grad_norm": 1.4718014001846313, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436948992, + "loss": 1.3653, + "grad_norm": 2.2325589656829834, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436964352, + "loss": 1.3632, + "grad_norm": 1.877788782119751, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436958208, + "loss": 1.3638, + "grad_norm": 1.3412086963653564, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436970496, + "loss": 1.3899, + "grad_norm": 1.6476160287857056, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437002752, + "loss": 1.368, + "grad_norm": 1.6131469011306763, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437004288, + "loss": 1.3751, + "grad_norm": 1.4000169038772583, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43697664, + "loss": 1.3844, + "grad_norm": 1.5824880599975586, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437025792, + "loss": 1.3587, + "grad_norm": 2.3679094314575195, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.3733, + "grad_norm": 2.00667142868042, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437013504, + "loss": 1.3634, + "grad_norm": 2.8760063648223877, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437004288, + "loss": 1.3452, + "grad_norm": 1.893934726715088, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43693824, + "loss": 1.3767, + "grad_norm": 2.9167017936706543, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436959744, + "loss": 1.3687, + "grad_norm": 1.7173620462417603, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43701504, + "loss": 1.3824, + "grad_norm": 4.691531658172607, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436932096, + "loss": 1.3656, + "grad_norm": 1.781585693359375, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436996608, + "loss": 1.3869, + "grad_norm": 2.184264898300171, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.4128, + "grad_norm": 2.842815637588501, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436987392, + "loss": 1.3739, + "grad_norm": 2.4537107944488525, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43696128, + "loss": 1.3821, + "grad_norm": 2.179072618484497, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.3475, + "grad_norm": 1.8482345342636108, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436965888, + "loss": 1.4111, + "grad_norm": 2.351365089416504, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436870656, + "loss": 1.3619, + "grad_norm": 2.0333783626556396, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436939776, + "loss": 1.3768, + "grad_norm": 2.3254451751708984, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436987392, + "loss": 1.3942, + "grad_norm": 2.081760883331299, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436932096, + "loss": 1.3925, + "grad_norm": 2.5955841541290283, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436990464, + "loss": 1.3866, + "grad_norm": 1.7712701559066772, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436996608, + "loss": 1.3617, + "grad_norm": 1.9820581674575806, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437133312, + "loss": 1.3665, + "grad_norm": 1.4278383255004883, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436942848, + "loss": 1.3812, + "grad_norm": 1.778550148010254, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43700736, + "loss": 1.3586, + "grad_norm": 1.8024600744247437, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43692288, + "loss": 1.3722, + "grad_norm": 2.1221506595611572, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436939776, + "loss": 1.3662, + "grad_norm": 1.7756028175354004, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436939776, + "loss": 1.3615, + "grad_norm": 2.7130725383758545, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437065728, + "loss": 1.3767, + "grad_norm": 1.6754722595214844, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43694592, + "loss": 1.3531, + "grad_norm": 2.413041353225708, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437104128, + "loss": 1.3809, + "grad_norm": 1.8563499450683594, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436929024, + "loss": 1.3817, + "grad_norm": 1.7901402711868286, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436947456, + "loss": 1.375, + "grad_norm": 1.7131940126419067, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437124096, + "loss": 1.3445, + "grad_norm": 2.2280545234680176, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436998144, + "loss": 1.3515, + "grad_norm": 1.6605305671691895, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436995072, + "loss": 1.3666, + "grad_norm": 2.249818801879883, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437004288, + "loss": 1.3784, + "grad_norm": 2.0690183639526367, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436879872, + "loss": 1.3993, + "grad_norm": 2.063326120376587, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437162496, + "loss": 1.3732, + "grad_norm": 2.6791951656341553, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436925952, + "loss": 1.359, + "grad_norm": 1.740431547164917, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437021184, + "loss": 1.3631, + "grad_norm": 1.5894725322723389, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436987392, + "loss": 1.3864, + "grad_norm": 4.287158966064453, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.3875, + "grad_norm": 2.463944673538208, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436973568, + "loss": 1.3828, + "grad_norm": 2.6525678634643555, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437024256, + "loss": 1.3971, + "grad_norm": 2.3816018104553223, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436919808, + "loss": 1.3748, + "grad_norm": 1.896399974822998, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436948992, + "loss": 1.3658, + "grad_norm": 2.0597591400146484, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43689216, + "loss": 1.3814, + "grad_norm": 1.9340543746948242, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436967424, + "loss": 1.3471, + "grad_norm": 2.0314033031463623, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436959744, + "loss": 1.3622, + "grad_norm": 1.9499433040618896, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436990464, + "loss": 1.3869, + "grad_norm": 1.8582836389541626, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436919808, + "loss": 1.378, + "grad_norm": 1.7725268602371216, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43694592, + "loss": 1.3813, + "grad_norm": 2.5149035453796387, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437041152, + "loss": 1.3668, + "grad_norm": 2.4087231159210205, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436936704, + "loss": 1.3656, + "grad_norm": 1.7467432022094727, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43694592, + "loss": 1.3829, + "grad_norm": 1.8908185958862305, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436973568, + "loss": 1.3646, + "grad_norm": 2.200730562210083, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436987392, + "loss": 1.3935, + "grad_norm": 2.3858351707458496, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436981248, + "loss": 1.3704, + "grad_norm": 2.083524227142334, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436944384, + "loss": 1.363, + "grad_norm": 1.9698450565338135, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437002752, + "loss": 1.3835, + "grad_norm": 3.016568899154663, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436912128, + "loss": 1.3835, + "grad_norm": 2.2180750370025635, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436996608, + "loss": 1.3586, + "grad_norm": 2.2156975269317627, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437035008, + "loss": 1.3531, + "grad_norm": 2.7578489780426025, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437079552, + "loss": 1.3847, + "grad_norm": 2.0806589126586914, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437041152, + "loss": 1.3739, + "grad_norm": 1.8992005586624146, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436978176, + "loss": 1.377, + "grad_norm": 2.1560027599334717, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437031936, + "loss": 1.3918, + "grad_norm": 2.238588809967041, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436970496, + "loss": 1.3998, + "grad_norm": 2.8667194843292236, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436956672, + "loss": 1.3665, + "grad_norm": 2.011626958847046, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43716864, + "loss": 1.3566, + "grad_norm": 2.1838929653167725, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437033472, + "loss": 1.3469, + "grad_norm": 3.6305816173553467, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43700736, + "loss": 1.3777, + "grad_norm": 2.414332151412964, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437013504, + "loss": 1.3615, + "grad_norm": 2.322244167327881, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437047296, + "loss": 1.3769, + "grad_norm": 3.078051805496216, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.3931, + "grad_norm": 1.9631168842315674, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437039616, + "loss": 1.3716, + "grad_norm": 2.567920684814453, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436927488, + "loss": 1.3527, + "grad_norm": 1.929931879043579, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436990464, + "loss": 1.3701, + "grad_norm": 1.9018645286560059, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.35, + "grad_norm": 2.419567108154297, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436955136, + "loss": 1.3519, + "grad_norm": 1.9865206480026245, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437039616, + "loss": 1.3702, + "grad_norm": 2.4486827850341797, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436985856, + "loss": 1.3957, + "grad_norm": 2.184225559234619, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437005824, + "loss": 1.385, + "grad_norm": 2.440579891204834, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436933632, + "loss": 1.3739, + "grad_norm": 3.1537342071533203, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437021184, + "loss": 1.3828, + "grad_norm": 1.9990376234054565, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436988928, + "loss": 1.351, + "grad_norm": 1.8849868774414062, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436967424, + "loss": 1.3651, + "grad_norm": 1.7759051322937012, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.548189696, + "gpu_mem": 4.436910592, + "loss": 1.3859, + "grad_norm": 3.324946165084839, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.548189696, + "gpu_mem": 4.43697664, + "loss": 1.406, + "grad_norm": 2.1703426837921143, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437018112, + "loss": 1.3654, + "grad_norm": 1.8603730201721191, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.548189696, + "gpu_mem": 4.437018112, + "train_runtime": 8378.9531, + "train_samples_per_second": 4.499, + "train_steps_per_second": 0.07, + "total_flos": 0.0, + "train_loss": 1.433186767255368 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0052eed638e4aeb48f103586efb96096bb8d3ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..731d68617b9c04b08a736204d2f33d5ee62f35f1 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.2679198512703987 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..ecdbde2aeb3551f9a7edfe698696fd0c3c554d51 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25389056 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-logiqa-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2", + "seed": 42, + "timestamp": "2025-08-31T03:25:22.864246" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..7c011616a9e869c4119345cf49534cf42bceee16 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r32-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.545965568, + "gpu_mem": 4.519357952, + "loss": 3.8396, + "grad_norm": 271.3771667480469, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722489856, + "loss": 3.9728, + "grad_norm": 270.0074768066406, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722566656, + "loss": 2.6682, + "grad_norm": 139.5524139404297, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72246528, + "loss": 1.7342, + "grad_norm": 44.00794219970703, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72248064, + "loss": 1.5108, + "grad_norm": 15.299688339233398, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72247296, + "loss": 1.5063, + "grad_norm": 17.20232582092285, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722543616, + "loss": 1.3378, + "grad_norm": 6.1887078285217285, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722528256, + "loss": 1.372, + "grad_norm": 10.312596321105957, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72252672, + "loss": 1.4392, + "grad_norm": 16.08499526977539, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722537472, + "loss": 1.5502, + "grad_norm": 30.192041397094727, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722440704, + "loss": 1.4603, + "grad_norm": 16.578365325927734, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722491392, + "loss": 1.4618, + "grad_norm": 26.531391143798828, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722583552, + "loss": 1.479, + "grad_norm": 18.423450469970703, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722496, + "loss": 1.4022, + "grad_norm": 6.835627555847168, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72263424, + "loss": 1.4142, + "grad_norm": 7.189740180969238, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722494464, + "loss": 1.4338, + "grad_norm": 11.586292266845703, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722525184, + "loss": 1.3878, + "grad_norm": 4.118014812469482, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72248832, + "loss": 1.4096, + "grad_norm": 9.754966735839844, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72239616, + "loss": 1.4031, + "grad_norm": 5.6669535636901855, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72243456, + "loss": 1.374, + "grad_norm": 2.8487565517425537, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722568192, + "loss": 1.5187, + "grad_norm": 13.678075790405273, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722466816, + "loss": 1.3727, + "grad_norm": 6.014540195465088, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722494464, + "loss": 1.682, + "grad_norm": 21.651535034179688, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72248832, + "loss": 1.4962, + "grad_norm": 12.43869686126709, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722494464, + "loss": 1.4438, + "grad_norm": 8.773053169250488, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722546688, + "loss": 1.4556, + "grad_norm": 11.020599365234375, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72248832, + "loss": 1.6483, + "grad_norm": 14.616039276123047, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72243456, + "loss": 1.5475, + "grad_norm": 9.463422775268555, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72252672, + "loss": 1.4245, + "grad_norm": 5.435797691345215, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722522112, + "loss": 1.3349, + "grad_norm": 5.284818172454834, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722500608, + "loss": 1.5385, + "grad_norm": 17.210933685302734, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722505216, + "loss": 1.7194, + "grad_norm": 23.252246856689453, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722540544, + "loss": 1.3456, + "grad_norm": 4.432555675506592, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72246528, + "loss": 1.4179, + "grad_norm": 6.796652793884277, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722512896, + "loss": 1.5632, + "grad_norm": 23.584243774414062, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722528256, + "loss": 1.4761, + "grad_norm": 8.497844696044922, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722594304, + "loss": 1.4139, + "grad_norm": 9.844210624694824, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722477568, + "loss": 1.4714, + "grad_norm": 7.682750225067139, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722594304, + "loss": 1.505, + "grad_norm": 11.14980411529541, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722515968, + "loss": 1.4592, + "grad_norm": 8.962281227111816, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722414592, + "loss": 1.4025, + "grad_norm": 5.890960216522217, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722486784, + "loss": 1.4361, + "grad_norm": 5.423296928405762, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72244992, + "loss": 1.3805, + "grad_norm": 4.231081962585449, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722485248, + "loss": 1.5223, + "grad_norm": 7.535573482513428, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722539008, + "loss": 1.4912, + "grad_norm": 6.9525651931762695, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722586624, + "loss": 1.4224, + "grad_norm": 4.725684642791748, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722414592, + "loss": 1.3948, + "grad_norm": 7.642266750335693, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722468352, + "loss": 1.5365, + "grad_norm": 9.672203063964844, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.551273984, + "gpu_mem": 4.7224576, + "loss": 1.4202, + "grad_norm": 4.105184078216553, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722466816, + "loss": 1.4331, + "grad_norm": 5.194916725158691, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722562048, + "loss": 1.3797, + "grad_norm": 2.8139877319335938, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72250368, + "loss": 1.5592, + "grad_norm": 9.379073143005371, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722582016, + "loss": 1.4135, + "grad_norm": 4.130614280700684, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722482176, + "loss": 1.7605, + "grad_norm": 16.032995223999023, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722479104, + "loss": 1.5251, + "grad_norm": 8.913888931274414, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722523648, + "loss": 1.473, + "grad_norm": 7.471628189086914, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722515968, + "loss": 1.3826, + "grad_norm": 2.505580425262451, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722539008, + "loss": 1.4173, + "grad_norm": 3.617821216583252, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722506752, + "loss": 1.4438, + "grad_norm": 5.74372673034668, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722497536, + "loss": 1.4471, + "grad_norm": 3.9761955738067627, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.551273984, + "gpu_mem": 4.7225344, + "loss": 1.3689, + "grad_norm": 1.4566318988800049, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722574336, + "loss": 1.4022, + "grad_norm": 2.9466397762298584, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72250368, + "loss": 1.3771, + "grad_norm": 1.8482449054718018, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722405376, + "loss": 1.4316, + "grad_norm": 4.072117805480957, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722486784, + "loss": 1.5166, + "grad_norm": 5.629543781280518, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722591232, + "loss": 1.405, + "grad_norm": 1.5960183143615723, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722468352, + "loss": 1.3926, + "grad_norm": 2.0797505378723145, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722520576, + "loss": 1.5078, + "grad_norm": 4.455959320068359, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722512896, + "loss": 1.5038, + "grad_norm": 4.382183074951172, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722428416, + "loss": 1.4588, + "grad_norm": 3.142360210418701, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722451456, + "loss": 1.4285, + "grad_norm": 3.227759838104248, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722500608, + "loss": 1.3934, + "grad_norm": 1.803337574005127, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722454528, + "loss": 1.3973, + "grad_norm": 1.4235550165176392, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722508288, + "loss": 1.3862, + "grad_norm": 1.4987317323684692, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722551296, + "loss": 1.5118, + "grad_norm": 6.6575469970703125, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722499072, + "loss": 1.5141, + "grad_norm": 7.410783290863037, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722437632, + "loss": 1.4096, + "grad_norm": 3.9286210536956787, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722539008, + "loss": 1.3765, + "grad_norm": 2.934699058532715, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722555904, + "loss": 1.4556, + "grad_norm": 6.281524658203125, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72239616, + "loss": 1.44, + "grad_norm": 3.021941661834717, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722502144, + "loss": 1.4028, + "grad_norm": 2.5619680881500244, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722474496, + "loss": 1.4926, + "grad_norm": 5.18883752822876, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72248832, + "loss": 1.5519, + "grad_norm": 6.495824337005615, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722494464, + "loss": 1.4755, + "grad_norm": 3.829564094543457, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722523648, + "loss": 1.3903, + "grad_norm": 0.8621678352355957, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72246528, + "loss": 1.4053, + "grad_norm": 2.4236562252044678, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722486784, + "loss": 1.4697, + "grad_norm": 2.60406494140625, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722512896, + "loss": 1.3878, + "grad_norm": 2.270684003829956, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722460672, + "loss": 1.3809, + "grad_norm": 1.4236985445022583, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722492928, + "loss": 1.4193, + "grad_norm": 3.5759921073913574, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722485248, + "loss": 1.4848, + "grad_norm": 6.9067463874816895, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.551273984, + "gpu_mem": 4.722425344, + "loss": 1.4779, + "grad_norm": 4.97851037979126, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.551273984, + "gpu_mem": 4.72262656, + "loss": 1.3841, + "grad_norm": 1.239617943763733, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722440704, + "loss": 1.4574, + "grad_norm": 3.6463546752929688, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722608128, + "loss": 1.4109, + "grad_norm": 1.9787710905075073, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722482176, + "loss": 1.4894, + "grad_norm": 4.2269511222839355, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722477568, + "loss": 1.4147, + "grad_norm": 2.4501824378967285, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722525184, + "loss": 1.5069, + "grad_norm": 5.392322540283203, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722506752, + "loss": 1.3711, + "grad_norm": 3.711118221282959, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722471424, + "loss": 1.4853, + "grad_norm": 5.242152214050293, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722454528, + "loss": 1.462, + "grad_norm": 4.053441524505615, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722505216, + "loss": 1.419, + "grad_norm": 2.324816942214966, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72246528, + "loss": 1.3734, + "grad_norm": 0.6682705879211426, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722554368, + "loss": 1.3999, + "grad_norm": 1.1558257341384888, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722423808, + "loss": 1.4127, + "grad_norm": 2.2834534645080566, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722486784, + "loss": 1.408, + "grad_norm": 1.3491995334625244, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722482176, + "loss": 1.4193, + "grad_norm": 1.6507482528686523, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722471424, + "loss": 1.3829, + "grad_norm": 1.164767861366272, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722640384, + "loss": 1.3941, + "grad_norm": 1.1770557165145874, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722437632, + "loss": 1.3604, + "grad_norm": 1.258116364479065, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722439168, + "loss": 1.3608, + "grad_norm": 0.5941305160522461, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722539008, + "loss": 1.4955, + "grad_norm": 3.544870138168335, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722637312, + "loss": 1.4954, + "grad_norm": 3.7154810428619385, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722466816, + "loss": 1.4134, + "grad_norm": 1.6514805555343628, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722476032, + "loss": 1.4078, + "grad_norm": 1.7302197217941284, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722537472, + "loss": 1.3954, + "grad_norm": 1.7851572036743164, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722429952, + "loss": 1.4141, + "grad_norm": 1.564724326133728, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722525184, + "loss": 1.3902, + "grad_norm": 0.5744034647941589, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722658816, + "loss": 1.4085, + "grad_norm": 1.8977991342544556, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722562048, + "loss": 1.4239, + "grad_norm": 3.061228036880493, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722608128, + "loss": 1.386, + "grad_norm": 0.9628821015357971, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722552832, + "loss": 1.3932, + "grad_norm": 1.5408903360366821, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722591232, + "loss": 1.4792, + "grad_norm": 3.6138012409210205, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722514432, + "loss": 1.3684, + "grad_norm": 1.3878494501113892, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72254976, + "loss": 1.47, + "grad_norm": 13.806567192077637, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722456064, + "loss": 1.3717, + "grad_norm": 1.3156911134719849, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72248064, + "loss": 1.4156, + "grad_norm": 1.660102128982544, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722452992, + "loss": 1.3658, + "grad_norm": 0.5342100262641907, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722459136, + "loss": 1.4092, + "grad_norm": 1.9647552967071533, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72244992, + "loss": 1.4871, + "grad_norm": 2.9868907928466797, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722383872, + "loss": 1.4339, + "grad_norm": 2.1462063789367676, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722546688, + "loss": 1.4013, + "grad_norm": 1.1982227563858032, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722459136, + "loss": 1.3661, + "grad_norm": 1.6715247631072998, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722462208, + "loss": 1.4434, + "grad_norm": 2.002156972885132, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722517504, + "loss": 1.4699, + "grad_norm": 2.539489984512329, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722420736, + "loss": 1.5021, + "grad_norm": 3.8422982692718506, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72258816, + "loss": 1.4464, + "grad_norm": 3.228269577026367, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722454528, + "loss": 1.4031, + "grad_norm": 3.8493785858154297, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722443776, + "loss": 1.5096, + "grad_norm": 19.693716049194336, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722496, + "loss": 1.3814, + "grad_norm": 1.9551446437835693, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722420736, + "loss": 1.4419, + "grad_norm": 2.6929852962493896, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722482176, + "loss": 1.5, + "grad_norm": 3.4572694301605225, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72248064, + "loss": 1.3998, + "grad_norm": 1.7284815311431885, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722425344, + "loss": 1.4591, + "grad_norm": 4.240757942199707, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722422272, + "loss": 10.811, + "grad_norm": 717.0750122070312, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722523648, + "loss": 1.385, + "grad_norm": 2.6633331775665283, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722528256, + "loss": 1.4421, + "grad_norm": 4.065269947052002, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72262656, + "loss": 1.3741, + "grad_norm": 1.2735133171081543, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722469888, + "loss": 1.38, + "grad_norm": 1.1603426933288574, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722462208, + "loss": 1.4324, + "grad_norm": 2.7113873958587646, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722405376, + "loss": 1.4202, + "grad_norm": 1.945507526397705, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722437632, + "loss": 1.3925, + "grad_norm": 1.2032877206802368, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72246528, + "loss": 1.4084, + "grad_norm": 1.9131542444229126, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722515968, + "loss": 1.4324, + "grad_norm": 3.4122989177703857, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722428416, + "loss": 1.4549, + "grad_norm": 2.9110217094421387, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722448384, + "loss": 1.4208, + "grad_norm": 2.2799835205078125, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722525184, + "loss": 1.3994, + "grad_norm": 1.701493740081787, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72242688, + "loss": 1.4081, + "grad_norm": 1.1864149570465088, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722528256, + "loss": 1.4266, + "grad_norm": 1.802351713180542, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722485248, + "loss": 1.3802, + "grad_norm": 1.342867374420166, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722483712, + "loss": 1.4213, + "grad_norm": 2.153465509414673, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722466816, + "loss": 1.4035, + "grad_norm": 1.1201364994049072, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722583552, + "loss": 1.4475, + "grad_norm": 2.365255117416382, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722479104, + "loss": 1.3927, + "grad_norm": 1.5914548635482788, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722558976, + "loss": 1.3847, + "grad_norm": 0.7639757394790649, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722468352, + "loss": 1.5034, + "grad_norm": 5.434316158294678, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722429952, + "loss": 1.4448, + "grad_norm": 3.305232048034668, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722433024, + "loss": 1.3699, + "grad_norm": 1.5457658767700195, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722500608, + "loss": 1.3799, + "grad_norm": 1.811471700668335, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722508288, + "loss": 1.4005, + "grad_norm": 1.1352332830429077, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722482176, + "loss": 1.3862, + "grad_norm": 0.5826876163482666, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722462208, + "loss": 1.4213, + "grad_norm": 1.9313009977340698, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722499072, + "loss": 1.3867, + "grad_norm": 1.4438503980636597, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72244224, + "loss": 1.385, + "grad_norm": 0.3550768494606018, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722512896, + "loss": 1.3882, + "grad_norm": 1.3243764638900757, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722409984, + "loss": 1.3759, + "grad_norm": 0.8900005221366882, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.551470592, + "gpu_mem": 4.7224192, + "loss": 1.4478, + "grad_norm": 2.110278367996216, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722496, + "loss": 1.3989, + "grad_norm": 1.21245539188385, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72250368, + "loss": 1.3973, + "grad_norm": 1.4524458646774292, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722509824, + "loss": 1.4062, + "grad_norm": 1.2306846380233765, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722428416, + "loss": 1.4017, + "grad_norm": 1.2103828191757202, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722539008, + "loss": 1.3954, + "grad_norm": 1.1630715131759644, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72252672, + "loss": 1.4461, + "grad_norm": 2.982577085494995, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722509824, + "loss": 1.4056, + "grad_norm": 2.767061948776245, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722448384, + "loss": 1.3943, + "grad_norm": 3.0054008960723877, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72248064, + "loss": 1.3992, + "grad_norm": 1.4885081052780151, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722577408, + "loss": 1.3982, + "grad_norm": 1.8627299070358276, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.551470592, + "gpu_mem": 4.72254976, + "loss": 1.3773, + "grad_norm": 1.444387435913086, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722454528, + "loss": 1.4852, + "grad_norm": 3.37605357170105, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722486784, + "loss": 1.4322, + "grad_norm": 2.3520376682281494, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722476032, + "loss": 1.3985, + "grad_norm": 1.6314219236373901, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722413056, + "loss": 1.4623, + "grad_norm": 3.886289119720459, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.551470592, + "gpu_mem": 4.7225344, + "loss": 1.3976, + "grad_norm": 1.21151602268219, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722456064, + "loss": 1.3954, + "grad_norm": 1.973647117614746, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722568192, + "loss": 1.4053, + "grad_norm": 1.364322304725647, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.551470592, + "gpu_mem": 4.722563584, + "loss": 1.3854, + "grad_norm": 0.9883294701576233, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82405376, + "loss": 2.1804, + "grad_norm": 2.641925811767578, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824119808, + "loss": 1.4118, + "grad_norm": 1.0482864379882812, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824116736, + "loss": 1.3925, + "grad_norm": 1.5241440534591675, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824001536, + "loss": 1.3863, + "grad_norm": 1.1898653507232666, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824052224, + "loss": 1.476, + "grad_norm": 2.6304969787597656, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82404608, + "loss": 1.4522, + "grad_norm": 2.4260401725769043, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824093696, + "loss": 1.4078, + "grad_norm": 1.2206748723983765, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824026112, + "loss": 1.3743, + "grad_norm": 0.6892529129981995, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824072192, + "loss": 1.3931, + "grad_norm": 0.7511376142501831, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824086016, + "loss": 1.3719, + "grad_norm": 0.8811691403388977, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824041472, + "loss": 1.4321, + "grad_norm": 2.2129056453704834, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824033792, + "loss": 1.4882, + "grad_norm": 2.9165899753570557, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824155136, + "loss": 1.4407, + "grad_norm": 3.2292227745056152, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824052224, + "loss": 1.3826, + "grad_norm": 1.8866963386535645, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82405376, + "loss": 1.3682, + "grad_norm": 1.829493522644043, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824056832, + "loss": 1.3745, + "grad_norm": 0.629325807094574, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823983104, + "loss": 1.4386, + "grad_norm": 2.1756932735443115, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823978496, + "loss": 1.3926, + "grad_norm": 1.1817556619644165, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824029184, + "loss": 1.3895, + "grad_norm": 2.4658241271972656, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824158208, + "loss": 1.4332, + "grad_norm": 2.7499256134033203, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824019968, + "loss": 1.3754, + "grad_norm": 1.7992602586746216, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82404608, + "loss": 1.3736, + "grad_norm": 1.3103652000427246, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824058368, + "loss": 1.3865, + "grad_norm": 1.1715731620788574, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82403072, + "loss": 1.3865, + "grad_norm": 2.0342674255371094, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824004608, + "loss": 1.4135, + "grad_norm": 1.676208257675171, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824016896, + "loss": 1.4069, + "grad_norm": 1.6349016427993774, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824044544, + "loss": 1.349, + "grad_norm": 0.42087727785110474, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824062976, + "loss": 1.3708, + "grad_norm": 0.30793169140815735, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824113664, + "loss": 1.3911, + "grad_norm": 1.337777018547058, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824064512, + "loss": 1.3885, + "grad_norm": 0.663662314414978, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82401536, + "loss": 1.3963, + "grad_norm": 1.3594931364059448, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824059904, + "loss": 1.4283, + "grad_norm": 1.9465670585632324, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823958528, + "loss": 1.3773, + "grad_norm": 1.0921103954315186, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824026112, + "loss": 1.3856, + "grad_norm": 0.7312276363372803, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824016896, + "loss": 1.393, + "grad_norm": 1.5116827487945557, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824021504, + "loss": 1.3622, + "grad_norm": 1.6412174701690674, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824136704, + "loss": 1.4017, + "grad_norm": 1.1993286609649658, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824096768, + "loss": 1.4213, + "grad_norm": 1.8331832885742188, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824055296, + "loss": 1.4069, + "grad_norm": 1.7101690769195557, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82405376, + "loss": 1.4146, + "grad_norm": 2.0961923599243164, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824078336, + "loss": 1.3951, + "grad_norm": 0.804914653301239, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824098304, + "loss": 1.4053, + "grad_norm": 0.9675382375717163, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82401536, + "loss": 1.4199, + "grad_norm": 1.4990777969360352, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824150528, + "loss": 1.3873, + "grad_norm": 0.809116780757904, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824104448, + "loss": 1.3985, + "grad_norm": 0.9551796913146973, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824125952, + "loss": 1.4039, + "grad_norm": 1.2388542890548706, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824012288, + "loss": 1.3576, + "grad_norm": 2.413356065750122, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824, + "loss": 1.4768, + "grad_norm": 2.606452703475952, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824004608, + "loss": 1.4652, + "grad_norm": 2.258957624435425, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824041472, + "loss": 1.411, + "grad_norm": 1.3216497898101807, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824003072, + "loss": 1.3727, + "grad_norm": 1.4818793535232544, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824049152, + "loss": 1.4038, + "grad_norm": 0.9552634358406067, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824058368, + "loss": 1.397, + "grad_norm": 0.9909098148345947, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824096768, + "loss": 1.4114, + "grad_norm": 1.4204975366592407, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82401536, + "loss": 1.3848, + "grad_norm": 0.9547375440597534, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824024576, + "loss": 1.3981, + "grad_norm": 1.6257498264312744, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824090624, + "loss": 1.3989, + "grad_norm": 0.6737183332443237, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824016896, + "loss": 1.3939, + "grad_norm": 0.9097368717193604, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824010752, + "loss": 1.3838, + "grad_norm": 0.8187041282653809, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.551470592, + "gpu_mem": 4.8240768, + "loss": 1.3948, + "grad_norm": 0.6148169636726379, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824035328, + "loss": 1.3719, + "grad_norm": 0.24591919779777527, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824148992, + "loss": 1.4213, + "grad_norm": 1.8839397430419922, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824026112, + "loss": 1.4277, + "grad_norm": 1.520527720451355, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824052224, + "loss": 1.4119, + "grad_norm": 0.9448533654212952, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824058368, + "loss": 1.411, + "grad_norm": 1.1394981145858765, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824125952, + "loss": 1.4095, + "grad_norm": 1.012259840965271, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823981568, + "loss": 1.4002, + "grad_norm": 1.0023353099822998, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.551470592, + "gpu_mem": 4.8241152, + "loss": 1.3933, + "grad_norm": 0.36288848519325256, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824144384, + "loss": 1.38, + "grad_norm": 0.9548200368881226, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824013824, + "loss": 1.4135, + "grad_norm": 1.0557254552841187, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82401536, + "loss": 1.391, + "grad_norm": 0.7758092880249023, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824016896, + "loss": 1.3777, + "grad_norm": 0.39785683155059814, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824090624, + "loss": 1.4201, + "grad_norm": 1.1142178773880005, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824072192, + "loss": 1.3708, + "grad_norm": 0.233343243598938, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82401536, + "loss": 1.4182, + "grad_norm": 0.8968793153762817, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82405376, + "loss": 1.3695, + "grad_norm": 0.7298686504364014, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824121344, + "loss": 1.4018, + "grad_norm": 0.7068928480148315, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824073728, + "loss": 1.4088, + "grad_norm": 0.8555915951728821, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824201216, + "loss": 1.3624, + "grad_norm": 0.4845145642757416, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824086016, + "loss": 1.372, + "grad_norm": 0.42860937118530273, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824035328, + "loss": 1.3725, + "grad_norm": 0.5255283713340759, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82402304, + "loss": 1.3718, + "grad_norm": 0.7592639327049255, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82401536, + "loss": 1.3884, + "grad_norm": 0.5821996927261353, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82413824, + "loss": 1.4214, + "grad_norm": 1.2125145196914673, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824078336, + "loss": 1.4543, + "grad_norm": 1.870402216911316, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824024576, + "loss": 1.4469, + "grad_norm": 1.5391175746917725, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824041472, + "loss": 1.4205, + "grad_norm": 1.2172181606292725, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824181248, + "loss": 1.3851, + "grad_norm": 0.32691463828086853, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824052224, + "loss": 1.4018, + "grad_norm": 1.0365663766860962, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824047616, + "loss": 1.4299, + "grad_norm": 1.633254885673523, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823950848, + "loss": 1.3845, + "grad_norm": 0.8990828990936279, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824026112, + "loss": 1.3945, + "grad_norm": 1.1541779041290283, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82404608, + "loss": 1.3861, + "grad_norm": 0.9398142099380493, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824012288, + "loss": 1.379, + "grad_norm": 0.6198959350585938, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823973888, + "loss": 1.4247, + "grad_norm": 1.5448065996170044, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82409984, + "loss": 1.4225, + "grad_norm": 1.6145689487457275, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82397696, + "loss": 1.4176, + "grad_norm": 1.3426302671432495, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824019968, + "loss": 1.3999, + "grad_norm": 1.1035585403442383, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824003072, + "loss": 1.3802, + "grad_norm": 1.4438745975494385, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824044544, + "loss": 1.411, + "grad_norm": 1.2223833799362183, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82398464, + "loss": 1.4029, + "grad_norm": 1.0447454452514648, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82400768, + "loss": 1.4014, + "grad_norm": 1.3826704025268555, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823996928, + "loss": 1.3823, + "grad_norm": 0.37444961071014404, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824009216, + "loss": 1.3887, + "grad_norm": 0.5600830316543579, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824012288, + "loss": 1.3414, + "grad_norm": 1.1934688091278076, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823993856, + "loss": 1.3974, + "grad_norm": 0.5738597512245178, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824, + "loss": 1.4178, + "grad_norm": 0.9916496872901917, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823947776, + "loss": 1.4902, + "grad_norm": 2.090427875518799, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824019968, + "loss": 1.3715, + "grad_norm": 0.6034709811210632, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82406144, + "loss": 1.3719, + "grad_norm": 0.8657121658325195, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824009216, + "loss": 1.3769, + "grad_norm": 0.4712666869163513, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824019968, + "loss": 1.4006, + "grad_norm": 0.711251437664032, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824003072, + "loss": 1.423, + "grad_norm": 1.3841835260391235, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824058368, + "loss": 1.3908, + "grad_norm": 0.36009499430656433, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824024576, + "loss": 1.3887, + "grad_norm": 0.18713289499282837, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824033792, + "loss": 1.3894, + "grad_norm": 0.28689849376678467, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824035328, + "loss": 1.3829, + "grad_norm": 0.9756432771682739, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824262656, + "loss": 1.3998, + "grad_norm": 0.7462515830993652, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824055296, + "loss": 1.3912, + "grad_norm": 0.6030295491218567, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824027648, + "loss": 1.4046, + "grad_norm": 0.8174999356269836, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82399232, + "loss": 1.3692, + "grad_norm": 0.7832980155944824, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824082944, + "loss": 1.4047, + "grad_norm": 0.9715800285339355, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82400768, + "loss": 1.3657, + "grad_norm": 0.8473177552223206, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824055296, + "loss": 1.3895, + "grad_norm": 0.6562851071357727, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823996928, + "loss": 1.3845, + "grad_norm": 1.145982265472412, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824218112, + "loss": 1.4259, + "grad_norm": 1.451865792274475, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82409984, + "loss": 1.4056, + "grad_norm": 1.1007230281829834, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82399232, + "loss": 1.3931, + "grad_norm": 1.452522873878479, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824032256, + "loss": 1.3852, + "grad_norm": 0.6649460792541504, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824079872, + "loss": 1.39, + "grad_norm": 0.6176749467849731, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82404608, + "loss": 1.4022, + "grad_norm": 0.6544138193130493, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824101376, + "loss": 1.3978, + "grad_norm": 0.5311940908432007, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823993856, + "loss": 1.3754, + "grad_norm": 0.3555145561695099, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824078336, + "loss": 1.3772, + "grad_norm": 0.8369079828262329, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824067584, + "loss": 1.3579, + "grad_norm": 0.6875025629997253, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82402304, + "loss": 1.4134, + "grad_norm": 0.9211751818656921, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824112128, + "loss": 1.4217, + "grad_norm": 0.8491256833076477, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824039936, + "loss": 1.3952, + "grad_norm": 0.4295397102832794, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824049152, + "loss": 1.3911, + "grad_norm": 0.4205728769302368, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824101376, + "loss": 1.3799, + "grad_norm": 0.3420359194278717, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824101376, + "loss": 1.3895, + "grad_norm": 1.3230409622192383, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82413824, + "loss": 1.3677, + "grad_norm": 0.8734058141708374, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824055296, + "loss": 1.4045, + "grad_norm": 0.7086853384971619, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824124416, + "loss": 1.3966, + "grad_norm": 0.5378472805023193, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824066048, + "loss": 1.3962, + "grad_norm": 0.5128660202026367, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824072192, + "loss": 1.3681, + "grad_norm": 1.0943738222122192, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823983104, + "loss": 1.4187, + "grad_norm": 1.2269887924194336, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824101376, + "loss": 1.4069, + "grad_norm": 0.8399978280067444, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82405376, + "loss": 1.3953, + "grad_norm": 1.03951895236969, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824, + "loss": 1.3749, + "grad_norm": 1.2858556509017944, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824049152, + "loss": 1.3949, + "grad_norm": 0.8824889063835144, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824175104, + "loss": 1.4006, + "grad_norm": 1.4508477449417114, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.551470592, + "gpu_mem": 4.823973888, + "loss": 1.3825, + "grad_norm": 0.7308638691902161, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.551470592, + "gpu_mem": 4.8240384, + "loss": 1.3813, + "grad_norm": 0.5678991675376892, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82405376, + "loss": 1.3808, + "grad_norm": 0.5174372792243958, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824044544, + "loss": 1.3972, + "grad_norm": 0.6785205006599426, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824041472, + "loss": 1.3776, + "grad_norm": 1.0116349458694458, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824016896, + "loss": 1.4025, + "grad_norm": 0.7240336537361145, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824, + "loss": 1.3747, + "grad_norm": 0.1348349004983902, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.551470592, + "gpu_mem": 4.824001536, + "loss": 1.381, + "grad_norm": 0.13849203288555145, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.551470592, + "gpu_mem": 4.82405376, + "loss": 1.4153, + "grad_norm": 1.2179282903671265, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824119808, + "loss": 1.3862, + "grad_norm": 1.787388563156128, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824073728, + "loss": 1.3899, + "grad_norm": 0.7719748616218567, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824124416, + "loss": 1.3836, + "grad_norm": 0.3153761029243469, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824095232, + "loss": 1.3923, + "grad_norm": 0.8031628727912903, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824029184, + "loss": 1.3943, + "grad_norm": 0.8615172505378723, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.5516672, + "gpu_mem": 4.823986176, + "loss": 1.3771, + "grad_norm": 0.20750238001346588, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824013824, + "loss": 1.3559, + "grad_norm": 0.9327572584152222, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.5516672, + "gpu_mem": 4.823996928, + "loss": 1.3928, + "grad_norm": 0.7289075255393982, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824039936, + "loss": 1.407, + "grad_norm": 1.1459906101226807, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824093696, + "loss": 1.4378, + "grad_norm": 1.2897809743881226, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.5516672, + "gpu_mem": 4.823990784, + "loss": 1.3983, + "grad_norm": 0.9696313142776489, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824050688, + "loss": 1.4017, + "grad_norm": 1.0442718267440796, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.5516672, + "gpu_mem": 4.823973888, + "loss": 1.3481, + "grad_norm": 0.7889568209648132, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824133632, + "loss": 1.3862, + "grad_norm": 0.653309166431427, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.5516672, + "gpu_mem": 4.823987712, + "loss": 1.3924, + "grad_norm": 0.4430103600025177, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824010752, + "loss": 1.3819, + "grad_norm": 0.424518346786499, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824033792, + "loss": 1.4448, + "grad_norm": 1.2372688055038452, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824258048, + "loss": 1.4212, + "grad_norm": 0.926084041595459, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.5516672, + "gpu_mem": 4.82401536, + "loss": 1.3742, + "grad_norm": 0.3879023790359497, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.5516672, + "gpu_mem": 4.82413824, + "loss": 1.4137, + "grad_norm": 0.6434004306793213, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.5516672, + "gpu_mem": 4.82405376, + "loss": 1.3732, + "grad_norm": 0.14771580696105957, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824142848, + "loss": 1.3977, + "grad_norm": 1.311026692390442, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824041472, + "loss": 1.4071, + "grad_norm": 0.895010232925415, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824049152, + "loss": 1.3925, + "grad_norm": 0.3043169677257538, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.5516672, + "gpu_mem": 4.823972352, + "loss": 1.3807, + "grad_norm": 0.9626392722129822, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824052224, + "loss": 1.4056, + "grad_norm": 1.201212763786316, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.5516672, + "gpu_mem": 4.82402304, + "loss": 1.402, + "grad_norm": 0.7693207263946533, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824010752, + "loss": 1.3927, + "grad_norm": 0.4780730605125427, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.5516672, + "gpu_mem": 4.8241536, + "loss": 1.3697, + "grad_norm": 1.1756418943405151, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824036864, + "loss": 1.3814, + "grad_norm": 0.6135494112968445, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824047616, + "loss": 1.4109, + "grad_norm": 0.8982216715812683, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824202752, + "loss": 1.387, + "grad_norm": 0.8449673056602478, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824012288, + "loss": 1.3932, + "grad_norm": 1.0608329772949219, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824033792, + "loss": 1.3879, + "grad_norm": 0.5371776819229126, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.5516672, + "gpu_mem": 4.824070656, + "loss": 1.397, + "grad_norm": 0.7191483974456787, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.5516672, + "gpu_mem": 4.823657472, + "loss": 2.0638, + "grad_norm": 0.502435028553009, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722436096, + "loss": 1.373, + "grad_norm": 1.2797338962554932, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722414592, + "loss": 1.3688, + "grad_norm": 0.5172354578971863, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722429952, + "loss": 1.414, + "grad_norm": 1.1282832622528076, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722577408, + "loss": 1.3729, + "grad_norm": 0.5787538886070251, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722523648, + "loss": 1.4005, + "grad_norm": 0.7913793325424194, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722540544, + "loss": 1.3755, + "grad_norm": 0.5702304244041443, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722492928, + "loss": 1.3889, + "grad_norm": 0.5534777641296387, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722517504, + "loss": 1.3981, + "grad_norm": 0.847438395023346, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72256512, + "loss": 1.3834, + "grad_norm": 0.5697630643844604, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722474496, + "loss": 1.3797, + "grad_norm": 0.8113382458686829, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722446848, + "loss": 1.384, + "grad_norm": 0.7403522729873657, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722535936, + "loss": 1.3832, + "grad_norm": 0.4111902415752411, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72242688, + "loss": 1.3764, + "grad_norm": 0.6267712712287903, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722479104, + "loss": 1.3979, + "grad_norm": 0.530770480632782, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722433024, + "loss": 1.3758, + "grad_norm": 0.6244580149650574, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722635776, + "loss": 1.3742, + "grad_norm": 0.882335364818573, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72263424, + "loss": 1.4209, + "grad_norm": 1.0021132230758667, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72255744, + "loss": 1.3963, + "grad_norm": 0.5310623049736023, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722514432, + "loss": 1.3832, + "grad_norm": 0.5882535576820374, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722491392, + "loss": 1.3865, + "grad_norm": 0.7175636887550354, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722445312, + "loss": 1.3834, + "grad_norm": 0.48065656423568726, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722562048, + "loss": 1.3485, + "grad_norm": 1.0788990259170532, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72244992, + "loss": 1.3863, + "grad_norm": 0.49346235394477844, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722439168, + "loss": 1.4038, + "grad_norm": 0.86514812707901, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722443776, + "loss": 1.3856, + "grad_norm": 0.37930673360824585, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722558976, + "loss": 1.4557, + "grad_norm": 1.4893485307693481, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722532864, + "loss": 1.4197, + "grad_norm": 0.9896464347839355, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72243456, + "loss": 1.3896, + "grad_norm": 0.8609235882759094, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722469888, + "loss": 1.3833, + "grad_norm": 0.7888544201850891, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722436096, + "loss": 1.3896, + "grad_norm": 0.40036535263061523, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722569728, + "loss": 1.3828, + "grad_norm": 0.1620834618806839, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722543616, + "loss": 1.3827, + "grad_norm": 0.6449941396713257, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72258816, + "loss": 1.3599, + "grad_norm": 0.8058264255523682, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722479104, + "loss": 1.3554, + "grad_norm": 0.7026816606521606, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722496, + "loss": 1.3833, + "grad_norm": 1.4556708335876465, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722571264, + "loss": 1.4329, + "grad_norm": 1.3969082832336426, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7224192, + "loss": 1.4167, + "grad_norm": 1.3412868976593018, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722537472, + "loss": 1.4156, + "grad_norm": 1.186583161354065, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72252672, + "loss": 1.4239, + "grad_norm": 0.9954550862312317, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722492928, + "loss": 1.394, + "grad_norm": 0.46497249603271484, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722471424, + "loss": 1.3604, + "grad_norm": 0.5555978417396545, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722466816, + "loss": 1.414, + "grad_norm": 0.9207667112350464, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722477568, + "loss": 1.3787, + "grad_norm": 0.688822329044342, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722531328, + "loss": 1.4063, + "grad_norm": 0.8017343878746033, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722448384, + "loss": 1.3828, + "grad_norm": 0.25593069195747375, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722476032, + "loss": 1.3936, + "grad_norm": 0.6322511434555054, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722474496, + "loss": 1.3695, + "grad_norm": 0.4600679278373718, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722517504, + "loss": 1.404, + "grad_norm": 0.8055370450019836, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722515968, + "loss": 1.367, + "grad_norm": 0.7678072452545166, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722566656, + "loss": 1.3925, + "grad_norm": 0.6166152358055115, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7225728, + "loss": 1.3974, + "grad_norm": 0.456502228975296, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72251136, + "loss": 1.4047, + "grad_norm": 0.6939594149589539, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722500608, + "loss": 1.4237, + "grad_norm": 1.158292531967163, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722525184, + "loss": 1.4104, + "grad_norm": 0.7610563635826111, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72254208, + "loss": 1.3846, + "grad_norm": 0.5091719031333923, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722413056, + "loss": 1.3947, + "grad_norm": 0.4253019690513611, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72258816, + "loss": 1.3821, + "grad_norm": 0.48106855154037476, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722545152, + "loss": 1.3939, + "grad_norm": 0.592142641544342, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722454528, + "loss": 1.3888, + "grad_norm": 0.5233579874038696, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722402304, + "loss": 1.3905, + "grad_norm": 0.19239936769008636, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722417664, + "loss": 1.3952, + "grad_norm": 0.47527334094047546, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72251904, + "loss": 1.3874, + "grad_norm": 0.3452906310558319, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722528256, + "loss": 1.3767, + "grad_norm": 1.0427889823913574, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72255744, + "loss": 1.3941, + "grad_norm": 0.8290899991989136, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72261888, + "loss": 1.3902, + "grad_norm": 0.6631004810333252, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72247296, + "loss": 1.3737, + "grad_norm": 0.33023732900619507, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722560512, + "loss": 1.3755, + "grad_norm": 1.006908893585205, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.3759, + "grad_norm": 0.8271876573562622, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7225344, + "loss": 1.396, + "grad_norm": 0.5270286798477173, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722569728, + "loss": 1.3796, + "grad_norm": 0.9457967877388, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722508288, + "loss": 1.3668, + "grad_norm": 0.3560408651828766, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72246528, + "loss": 1.3783, + "grad_norm": 0.5760812759399414, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722466816, + "loss": 1.3716, + "grad_norm": 0.3530998229980469, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72250368, + "loss": 1.3538, + "grad_norm": 0.6126216650009155, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722476032, + "loss": 1.3642, + "grad_norm": 0.6583148837089539, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722532864, + "loss": 1.3928, + "grad_norm": 1.1439341306686401, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722562048, + "loss": 1.3923, + "grad_norm": 1.655451774597168, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722462208, + "loss": 1.4221, + "grad_norm": 1.1489430665969849, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722485248, + "loss": 1.414, + "grad_norm": 1.0387245416641235, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722460672, + "loss": 1.4185, + "grad_norm": 1.1219216585159302, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722552832, + "loss": 1.3893, + "grad_norm": 0.8340598344802856, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722422272, + "loss": 1.3905, + "grad_norm": 0.4686090648174286, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722558976, + "loss": 1.3822, + "grad_norm": 0.5529274344444275, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722416128, + "loss": 1.4071, + "grad_norm": 0.7993528246879578, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722439168, + "loss": 1.3735, + "grad_norm": 0.4819258451461792, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722486784, + "loss": 1.3995, + "grad_norm": 0.8543264865875244, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722529792, + "loss": 1.3874, + "grad_norm": 0.35341909527778625, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7224576, + "loss": 1.3816, + "grad_norm": 0.5040900707244873, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722555904, + "loss": 1.3738, + "grad_norm": 0.5523751974105835, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722451456, + "loss": 1.3907, + "grad_norm": 0.5961681008338928, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722606592, + "loss": 1.3928, + "grad_norm": 0.489266037940979, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722439168, + "loss": 1.3894, + "grad_norm": 0.39808815717697144, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722482176, + "loss": 1.3827, + "grad_norm": 0.4956972301006317, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72250368, + "loss": 1.3838, + "grad_norm": 0.5331231951713562, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72242688, + "loss": 1.3873, + "grad_norm": 0.4375815689563751, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722563584, + "loss": 1.3837, + "grad_norm": 0.5623536109924316, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722451456, + "loss": 1.3879, + "grad_norm": 0.43118542432785034, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722448384, + "loss": 1.3862, + "grad_norm": 0.3515470623970032, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722422272, + "loss": 1.3868, + "grad_norm": 0.3298569917678833, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7224576, + "loss": 1.383, + "grad_norm": 0.6903353333473206, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72247296, + "loss": 1.3842, + "grad_norm": 0.6602411270141602, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722466816, + "loss": 1.3797, + "grad_norm": 0.3897811472415924, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722479104, + "loss": 1.3946, + "grad_norm": 0.485270231962204, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72251136, + "loss": 1.3855, + "grad_norm": 0.5218082666397095, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722512896, + "loss": 1.3817, + "grad_norm": 0.23808270692825317, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722485248, + "loss": 1.3879, + "grad_norm": 0.25771450996398926, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7225344, + "loss": 1.373, + "grad_norm": 0.820423424243927, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.3868, + "grad_norm": 0.5707834362983704, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722522112, + "loss": 1.3769, + "grad_norm": 0.6583533883094788, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722512896, + "loss": 1.3781, + "grad_norm": 0.20566970109939575, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722446848, + "loss": 1.3694, + "grad_norm": 0.6135937571525574, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722468352, + "loss": 1.3864, + "grad_norm": 0.2826383411884308, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722523648, + "loss": 1.369, + "grad_norm": 1.1443809270858765, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722440704, + "loss": 1.3821, + "grad_norm": 0.30513325333595276, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722505216, + "loss": 1.394, + "grad_norm": 0.524377167224884, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.4085, + "grad_norm": 0.676613450050354, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722496, + "loss": 1.3734, + "grad_norm": 0.5955591797828674, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722469888, + "loss": 1.3887, + "grad_norm": 0.3319673240184784, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.3735, + "grad_norm": 0.5458908677101135, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722474496, + "loss": 1.3886, + "grad_norm": 0.7336044907569885, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722379264, + "loss": 1.3668, + "grad_norm": 0.6494007110595703, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722448384, + "loss": 1.3925, + "grad_norm": 0.806191623210907, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722496, + "loss": 1.3961, + "grad_norm": 0.5098924040794373, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722440704, + "loss": 1.4133, + "grad_norm": 0.831392765045166, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722499072, + "loss": 1.4011, + "grad_norm": 0.42338210344314575, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722505216, + "loss": 1.3752, + "grad_norm": 0.40642625093460083, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72264192, + "loss": 1.3851, + "grad_norm": 0.3687264323234558, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722451456, + "loss": 1.3978, + "grad_norm": 0.45433759689331055, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722515968, + "loss": 1.362, + "grad_norm": 0.5264555215835571, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722431488, + "loss": 1.3706, + "grad_norm": 0.5409470796585083, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722448384, + "loss": 1.3865, + "grad_norm": 0.15573547780513763, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722448384, + "loss": 1.3879, + "grad_norm": 0.6277367472648621, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722574336, + "loss": 1.3823, + "grad_norm": 0.29285651445388794, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722454528, + "loss": 1.3698, + "grad_norm": 0.6419893503189087, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722612736, + "loss": 1.3944, + "grad_norm": 0.4858682155609131, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722437632, + "loss": 1.3912, + "grad_norm": 0.41376155614852905, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722456064, + "loss": 1.3889, + "grad_norm": 0.41145309805870056, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722632704, + "loss": 1.3733, + "grad_norm": 0.5009766221046448, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722506752, + "loss": 1.3883, + "grad_norm": 0.3440101742744446, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72250368, + "loss": 1.3692, + "grad_norm": 0.34446197748184204, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722512896, + "loss": 1.3883, + "grad_norm": 0.35409578680992126, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72238848, + "loss": 1.39, + "grad_norm": 0.27121633291244507, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722671104, + "loss": 1.3903, + "grad_norm": 0.4778631925582886, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72243456, + "loss": 1.372, + "grad_norm": 0.24676819145679474, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722529792, + "loss": 1.3801, + "grad_norm": 0.10632278025150299, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722496, + "loss": 1.3609, + "grad_norm": 0.8037012219429016, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.3881, + "grad_norm": 0.4939643442630768, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722482176, + "loss": 1.3747, + "grad_norm": 0.32902196049690247, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722532864, + "loss": 1.4085, + "grad_norm": 0.6574386358261108, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722428416, + "loss": 1.3847, + "grad_norm": 0.597353994846344, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7224576, + "loss": 1.4002, + "grad_norm": 0.6885882019996643, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722400768, + "loss": 1.3931, + "grad_norm": 0.4509515166282654, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722476032, + "loss": 1.3748, + "grad_norm": 0.31158390641212463, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722468352, + "loss": 1.3742, + "grad_norm": 0.22705523669719696, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722499072, + "loss": 1.3954, + "grad_norm": 0.39787203073501587, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722428416, + "loss": 1.394, + "grad_norm": 0.47616589069366455, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722454528, + "loss": 1.3661, + "grad_norm": 0.7114639282226562, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72254976, + "loss": 1.3885, + "grad_norm": 0.6986518502235413, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722445312, + "loss": 1.3816, + "grad_norm": 0.281005859375, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722454528, + "loss": 1.3879, + "grad_norm": 0.32937270402908325, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722482176, + "loss": 1.3657, + "grad_norm": 0.5604687333106995, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722496, + "loss": 1.3976, + "grad_norm": 0.5719352960586548, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722489856, + "loss": 1.3887, + "grad_norm": 0.44682666659355164, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722452992, + "loss": 1.3725, + "grad_norm": 0.5492080450057983, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72251136, + "loss": 1.389, + "grad_norm": 0.8523757457733154, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722420736, + "loss": 1.3777, + "grad_norm": 0.35911449790000916, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722505216, + "loss": 1.3756, + "grad_norm": 0.43615591526031494, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722543616, + "loss": 1.3744, + "grad_norm": 0.646202027797699, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72258816, + "loss": 1.3924, + "grad_norm": 0.3355652987957001, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72254976, + "loss": 1.3937, + "grad_norm": 0.3837812840938568, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722486784, + "loss": 1.3928, + "grad_norm": 0.5439286828041077, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722540544, + "loss": 1.4078, + "grad_norm": 0.7915974855422974, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722479104, + "loss": 1.4069, + "grad_norm": 0.7494407892227173, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72246528, + "loss": 1.3925, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722677248, + "loss": 1.3831, + "grad_norm": 0.418610543012619, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72254208, + "loss": 1.3656, + "grad_norm": 0.8256853222846985, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722515968, + "loss": 1.3797, + "grad_norm": 0.5066593289375305, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722522112, + "loss": 1.3836, + "grad_norm": 0.5989217758178711, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722555904, + "loss": 1.3947, + "grad_norm": 0.5632013082504272, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.3804, + "grad_norm": 0.357595294713974, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722548224, + "loss": 1.4011, + "grad_norm": 0.9546111226081848, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722436096, + "loss": 1.3801, + "grad_norm": 0.34118926525115967, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722499072, + "loss": 1.3933, + "grad_norm": 0.28512343764305115, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.382, + "grad_norm": 0.4648236930370331, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722463744, + "loss": 1.3764, + "grad_norm": 0.31836631894111633, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722548224, + "loss": 1.3747, + "grad_norm": 0.3417756259441376, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722494464, + "loss": 1.3785, + "grad_norm": 0.34367117285728455, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722514432, + "loss": 1.4013, + "grad_norm": 0.5484647154808044, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72244224, + "loss": 1.3965, + "grad_norm": 0.7166821956634521, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722529792, + "loss": 1.3862, + "grad_norm": 0.44896969199180603, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722497536, + "loss": 1.3842, + "grad_norm": 0.163295716047287, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722476032, + "loss": 1.3825, + "grad_norm": 0.14979392290115356, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.5516672, + "gpu_mem": 4.7224192, + "loss": 1.4071, + "grad_norm": 0.8315573334693909, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.5516672, + "gpu_mem": 4.722485248, + "loss": 1.3829, + "grad_norm": 0.5591462850570679, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72252672, + "loss": 1.3833, + "grad_norm": 0.20603559911251068, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.5516672, + "gpu_mem": 4.72252672, + "train_runtime": 8423.3497, + "train_samples_per_second": 4.476, + "train_steps_per_second": 0.07, + "total_flos": 0.0, + "train_loss": 1.4356497396417216 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6934cfad94edb068f0d54db83e6a8b58f0fc939 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..120bdc51cafa128902f25751608cba97b4186fd6 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.3573641809543483 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..2d689d157b7d7490e0baf72c1b59fe8f1688accb --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6317696 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-logiqa-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T20:27:50.593543" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..ce5516f0576d2ce920eea592fbbab412370b89ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r8-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.543450624, + "gpu_mem": 4.443111936, + "loss": 3.8396, + "grad_norm": 263.4623718261719, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.54875904, + "gpu_mem": 4.493751808, + "loss": 3.9728, + "grad_norm": 262.2771911621094, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.54875904, + "gpu_mem": 4.493828608, + "loss": 3.1553, + "grad_norm": 196.2952117919922, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493727232, + "loss": 2.2269, + "grad_norm": 99.4975357055664, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493742592, + "loss": 1.8484, + "grad_norm": 48.89902114868164, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493734912, + "loss": 1.6258, + "grad_norm": 21.519527435302734, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493805568, + "loss": 1.4173, + "grad_norm": 10.89854621887207, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493790208, + "loss": 1.4641, + "grad_norm": 19.983911514282227, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493788672, + "loss": 1.4059, + "grad_norm": 10.421069145202637, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493799424, + "loss": 1.4841, + "grad_norm": 18.2463321685791, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493702656, + "loss": 1.4882, + "grad_norm": 12.941911697387695, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493753344, + "loss": 1.404, + "grad_norm": 8.999972343444824, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493845504, + "loss": 1.4152, + "grad_norm": 10.954671859741211, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493757952, + "loss": 1.4457, + "grad_norm": 10.533514022827148, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493896192, + "loss": 1.3991, + "grad_norm": 5.221796035766602, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493756416, + "loss": 1.4721, + "grad_norm": 11.932865142822266, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493787136, + "loss": 1.3844, + "grad_norm": 3.528195858001709, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493750272, + "loss": 1.3742, + "grad_norm": 5.116476058959961, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493658112, + "loss": 1.4053, + "grad_norm": 4.93786096572876, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493696512, + "loss": 1.3803, + "grad_norm": 3.6637585163116455, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493830144, + "loss": 1.4392, + "grad_norm": 8.401270866394043, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493728768, + "loss": 1.3815, + "grad_norm": 5.607382297515869, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493756416, + "loss": 1.5667, + "grad_norm": 12.697549819946289, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493750272, + "loss": 1.4847, + "grad_norm": 9.185148239135742, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493756416, + "loss": 1.5221, + "grad_norm": 9.454619407653809, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49380864, + "loss": 1.368, + "grad_norm": 6.330122470855713, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493750272, + "loss": 1.5139, + "grad_norm": 11.576857566833496, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493696512, + "loss": 1.474, + "grad_norm": 8.180583000183105, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493788672, + "loss": 1.3933, + "grad_norm": 4.263663291931152, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493784064, + "loss": 1.3428, + "grad_norm": 6.418519020080566, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49376256, + "loss": 1.3884, + "grad_norm": 7.910444259643555, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493767168, + "loss": 1.5762, + "grad_norm": 19.049686431884766, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493802496, + "loss": 1.3534, + "grad_norm": 4.597019195556641, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493727232, + "loss": 1.4406, + "grad_norm": 6.63232946395874, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493774848, + "loss": 1.3744, + "grad_norm": 2.574458122253418, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493790208, + "loss": 1.4396, + "grad_norm": 9.705180168151855, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493856256, + "loss": 1.5177, + "grad_norm": 16.29548454284668, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49373952, + "loss": 1.4406, + "grad_norm": 4.15235710144043, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493856256, + "loss": 1.4538, + "grad_norm": 4.818760395050049, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49377792, + "loss": 1.3802, + "grad_norm": 2.8532683849334717, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493676544, + "loss": 1.3857, + "grad_norm": 3.0497469902038574, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493748736, + "loss": 1.4514, + "grad_norm": 5.865910053253174, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493711872, + "loss": 1.3951, + "grad_norm": 4.429351806640625, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.548955648, + "gpu_mem": 4.4937472, + "loss": 1.4663, + "grad_norm": 6.049858093261719, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49380096, + "loss": 1.4372, + "grad_norm": 4.684779644012451, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493848576, + "loss": 1.3945, + "grad_norm": 2.8447964191436768, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493676544, + "loss": 1.3694, + "grad_norm": 2.092355966567993, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493730304, + "loss": 1.412, + "grad_norm": 3.8053181171417236, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493719552, + "loss": 1.4878, + "grad_norm": 5.71662712097168, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493728768, + "loss": 1.3829, + "grad_norm": 1.4902143478393555, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493824, + "loss": 1.366, + "grad_norm": 2.2738630771636963, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493765632, + "loss": 1.4318, + "grad_norm": 4.380495071411133, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493843968, + "loss": 1.4173, + "grad_norm": 3.361121416091919, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493744128, + "loss": 1.4106, + "grad_norm": 3.8658447265625, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493741056, + "loss": 1.426, + "grad_norm": 4.589377403259277, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.548955648, + "gpu_mem": 4.4937856, + "loss": 1.402, + "grad_norm": 3.1339633464813232, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49377792, + "loss": 1.3604, + "grad_norm": 1.6231077909469604, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49380096, + "loss": 1.4284, + "grad_norm": 3.838261604309082, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493768704, + "loss": 1.4718, + "grad_norm": 6.222418308258057, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493759488, + "loss": 1.4042, + "grad_norm": 2.0736210346221924, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493796352, + "loss": 1.3792, + "grad_norm": 2.757523536682129, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493836288, + "loss": 1.37, + "grad_norm": 1.1986058950424194, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493765632, + "loss": 1.3622, + "grad_norm": 1.1270558834075928, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493667328, + "loss": 1.3969, + "grad_norm": 3.8118391036987305, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493748736, + "loss": 1.4419, + "grad_norm": 4.2420148849487305, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493853184, + "loss": 1.4068, + "grad_norm": 2.3709568977355957, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493730304, + "loss": 1.3964, + "grad_norm": 2.733264684677124, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493782528, + "loss": 1.4231, + "grad_norm": 3.041414499282837, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493774848, + "loss": 1.4179, + "grad_norm": 2.9425160884857178, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493690368, + "loss": 1.4124, + "grad_norm": 2.2964680194854736, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493713408, + "loss": 1.4078, + "grad_norm": 3.439091205596924, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49376256, + "loss": 1.3707, + "grad_norm": 1.1325554847717285, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49371648, + "loss": 1.4082, + "grad_norm": 2.617950677871704, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49377024, + "loss": 1.4444, + "grad_norm": 3.739596366882324, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493813248, + "loss": 1.4797, + "grad_norm": 5.954856872558594, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493761024, + "loss": 1.4327, + "grad_norm": 4.112733364105225, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493699584, + "loss": 1.3821, + "grad_norm": 1.9279943704605103, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49380096, + "loss": 1.4008, + "grad_norm": 4.672056674957275, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493817856, + "loss": 1.32, + "grad_norm": 4.567533016204834, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493658112, + "loss": 1.5431, + "grad_norm": 8.950145721435547, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493764096, + "loss": 1.389, + "grad_norm": 2.6848866939544678, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493736448, + "loss": 1.4698, + "grad_norm": 4.940035343170166, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493750272, + "loss": 1.4333, + "grad_norm": 4.4171624183654785, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493756416, + "loss": 1.3997, + "grad_norm": 3.0464634895324707, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.548955648, + "gpu_mem": 4.4937856, + "loss": 1.3997, + "grad_norm": 1.7955894470214844, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493727232, + "loss": 1.381, + "grad_norm": 1.7920331954956055, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493748736, + "loss": 1.4073, + "grad_norm": 2.124807119369507, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493774848, + "loss": 1.374, + "grad_norm": 2.4253909587860107, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493722624, + "loss": 1.4753, + "grad_norm": 4.976850509643555, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49375488, + "loss": 1.4729, + "grad_norm": 5.208970069885254, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.548955648, + "gpu_mem": 4.4937472, + "loss": 1.4395, + "grad_norm": 5.626943111419678, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493687296, + "loss": 1.4343, + "grad_norm": 4.959468364715576, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493888512, + "loss": 1.3821, + "grad_norm": 1.5149654150009155, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493702656, + "loss": 1.5387, + "grad_norm": 6.7013750076293945, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49387008, + "loss": 1.4533, + "grad_norm": 4.1641316413879395, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493744128, + "loss": 1.4154, + "grad_norm": 3.6885852813720703, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49373952, + "loss": 1.4085, + "grad_norm": 3.187304973602295, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493787136, + "loss": 1.5714, + "grad_norm": 10.36270523071289, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493768704, + "loss": 1.4366, + "grad_norm": 6.099697589874268, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493733376, + "loss": 1.4054, + "grad_norm": 3.1802968978881836, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49371648, + "loss": 1.4088, + "grad_norm": 1.5544377565383911, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493767168, + "loss": 1.4084, + "grad_norm": 1.7905771732330322, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493727232, + "loss": 1.3747, + "grad_norm": 0.7225137948989868, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49381632, + "loss": 1.4447, + "grad_norm": 3.222010374069214, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49368576, + "loss": 1.5563, + "grad_norm": 6.599139213562012, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493748736, + "loss": 1.4258, + "grad_norm": 2.679426670074463, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493744128, + "loss": 1.3885, + "grad_norm": 0.7273852229118347, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493733376, + "loss": 1.3834, + "grad_norm": 1.5885823965072632, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493902336, + "loss": 1.3967, + "grad_norm": 1.5692379474639893, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493699584, + "loss": 1.4155, + "grad_norm": 3.2361180782318115, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49370112, + "loss": 1.4342, + "grad_norm": 3.014986515045166, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49380096, + "loss": 1.3723, + "grad_norm": 0.9222341775894165, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493899264, + "loss": 1.3753, + "grad_norm": 1.193905234336853, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493728768, + "loss": 1.4132, + "grad_norm": 1.6691721677780151, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493737984, + "loss": 1.4143, + "grad_norm": 1.9173507690429688, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493799424, + "loss": 1.4294, + "grad_norm": 3.6753170490264893, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493691904, + "loss": 1.3837, + "grad_norm": 0.995786190032959, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493787136, + "loss": 1.3888, + "grad_norm": 1.1405963897705078, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493920768, + "loss": 1.3702, + "grad_norm": 1.5577439069747925, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493824, + "loss": 1.5129, + "grad_norm": 5.353615760803223, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49387008, + "loss": 1.4244, + "grad_norm": 3.463958501815796, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493814784, + "loss": 1.3933, + "grad_norm": 1.6234169006347656, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493853184, + "loss": 1.6681, + "grad_norm": 15.544632911682129, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493776384, + "loss": 1.4317, + "grad_norm": 7.792145729064941, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493811712, + "loss": 1.4808, + "grad_norm": 4.067721843719482, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493718016, + "loss": 1.3822, + "grad_norm": 1.5442131757736206, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493742592, + "loss": 1.4658, + "grad_norm": 3.202880620956421, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493714944, + "loss": 1.3784, + "grad_norm": 1.4464333057403564, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493721088, + "loss": 1.4157, + "grad_norm": 2.841951608657837, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493711872, + "loss": 1.4863, + "grad_norm": 5.012450695037842, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493645824, + "loss": 1.4585, + "grad_norm": 5.363411903381348, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49380864, + "loss": 1.3744, + "grad_norm": 2.340942621231079, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493721088, + "loss": 1.3664, + "grad_norm": 5.671582221984863, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49372416, + "loss": 1.6155, + "grad_norm": 11.945754051208496, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493779456, + "loss": 1.5506, + "grad_norm": 9.770878791809082, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493682688, + "loss": 1.6022, + "grad_norm": 9.199873924255371, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493850112, + "loss": 1.4639, + "grad_norm": 3.7787086963653564, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49371648, + "loss": 1.4316, + "grad_norm": 4.088203430175781, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493705728, + "loss": 1.3848, + "grad_norm": 3.124978542327881, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493757952, + "loss": 1.3405, + "grad_norm": 1.2433826923370361, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493682688, + "loss": 1.5695, + "grad_norm": 7.719336032867432, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493744128, + "loss": 1.7183, + "grad_norm": 9.57568073272705, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493742592, + "loss": 1.5104, + "grad_norm": 4.921686172485352, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493687296, + "loss": 1.3843, + "grad_norm": 2.03564453125, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493684224, + "loss": 1.4055, + "grad_norm": 2.348337173461914, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.548955648, + "gpu_mem": 4.4937856, + "loss": 1.4037, + "grad_norm": 2.3145878314971924, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493790208, + "loss": 1.3792, + "grad_norm": 2.099241256713867, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493888512, + "loss": 1.4431, + "grad_norm": 3.97795033454895, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49373184, + "loss": 1.4284, + "grad_norm": 3.237278938293457, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49372416, + "loss": 1.3805, + "grad_norm": 1.1377184391021729, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493667328, + "loss": 1.4187, + "grad_norm": 2.5593974590301514, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493699584, + "loss": 1.4428, + "grad_norm": 3.8174688816070557, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493727232, + "loss": 1.4393, + "grad_norm": 3.5999579429626465, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49377792, + "loss": 1.3823, + "grad_norm": 1.821285605430603, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493690368, + "loss": 1.4408, + "grad_norm": 2.362061023712158, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493710336, + "loss": 1.479, + "grad_norm": 3.7810425758361816, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493787136, + "loss": 1.3674, + "grad_norm": 1.2702386379241943, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493688832, + "loss": 1.4059, + "grad_norm": 1.5716339349746704, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493790208, + "loss": 1.4279, + "grad_norm": 2.7276456356048584, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.548955648, + "gpu_mem": 4.4937472, + "loss": 1.3841, + "grad_norm": 2.6820287704467773, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493745664, + "loss": 1.3897, + "grad_norm": 2.4373202323913574, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493728768, + "loss": 1.4271, + "grad_norm": 2.619266986846924, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493845504, + "loss": 1.4789, + "grad_norm": 3.993908166885376, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493741056, + "loss": 1.4116, + "grad_norm": 2.3738760948181152, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493820928, + "loss": 1.3875, + "grad_norm": 0.7805041670799255, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493730304, + "loss": 1.4257, + "grad_norm": 2.764387845993042, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493691904, + "loss": 1.4076, + "grad_norm": 2.548055648803711, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493694976, + "loss": 1.3587, + "grad_norm": 1.3887176513671875, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49376256, + "loss": 1.4168, + "grad_norm": 2.78411865234375, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49377024, + "loss": 1.428, + "grad_norm": 1.7522097826004028, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493744128, + "loss": 1.4101, + "grad_norm": 1.66981041431427, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49372416, + "loss": 1.4087, + "grad_norm": 2.4739677906036377, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493761024, + "loss": 1.3696, + "grad_norm": 1.570025086402893, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493704192, + "loss": 1.393, + "grad_norm": 0.8794103860855103, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493774848, + "loss": 1.4178, + "grad_norm": 2.5211541652679443, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493671936, + "loss": 1.3724, + "grad_norm": 0.9500671029090881, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493681152, + "loss": 1.43, + "grad_norm": 2.618424892425537, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493757952, + "loss": 1.3904, + "grad_norm": 1.7236144542694092, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493765632, + "loss": 1.3852, + "grad_norm": 3.2436625957489014, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493771776, + "loss": 1.4202, + "grad_norm": 2.5512611865997314, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493690368, + "loss": 1.4142, + "grad_norm": 2.1180455684661865, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49380096, + "loss": 1.4157, + "grad_norm": 1.8322601318359375, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493788672, + "loss": 1.4032, + "grad_norm": 1.864183783531189, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493771776, + "loss": 1.414, + "grad_norm": 2.700875997543335, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493710336, + "loss": 1.3928, + "grad_norm": 2.4159388542175293, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493742592, + "loss": 1.3766, + "grad_norm": 1.3019382953643799, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49383936, + "loss": 1.3762, + "grad_norm": 0.8830015659332275, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493811712, + "loss": 1.3667, + "grad_norm": 0.9610096216201782, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.548955648, + "gpu_mem": 4.49371648, + "loss": 1.5049, + "grad_norm": 3.2255423069000244, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493748736, + "loss": 1.4148, + "grad_norm": 2.0446369647979736, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493737984, + "loss": 1.3993, + "grad_norm": 1.6096148490905762, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493675008, + "loss": 1.3908, + "grad_norm": 2.0882649421691895, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493796352, + "loss": 1.391, + "grad_norm": 0.38686811923980713, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493718016, + "loss": 1.4187, + "grad_norm": 2.5117785930633545, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493830144, + "loss": 1.3742, + "grad_norm": 0.6958848834037781, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.548955648, + "gpu_mem": 4.493825536, + "loss": 1.3924, + "grad_norm": 1.5233603715896606, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519069696, + "loss": 2.1447, + "grad_norm": 5.07933235168457, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519135744, + "loss": 1.4164, + "grad_norm": 2.4734885692596436, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519132672, + "loss": 1.3798, + "grad_norm": 1.798200011253357, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519017472, + "loss": 1.3833, + "grad_norm": 1.7802138328552246, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51906816, + "loss": 1.4143, + "grad_norm": 2.4477458000183105, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519062016, + "loss": 1.4277, + "grad_norm": 2.7050905227661133, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519109632, + "loss": 1.6583, + "grad_norm": 190.3770751953125, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519042048, + "loss": 1.4118, + "grad_norm": 3.01418399810791, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519088128, + "loss": 1.3924, + "grad_norm": 1.1563904285430908, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519101952, + "loss": 1.3897, + "grad_norm": 1.6158376932144165, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519057408, + "loss": 1.3812, + "grad_norm": 1.5778634548187256, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519049728, + "loss": 1.4474, + "grad_norm": 2.793684244155884, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519171072, + "loss": 1.4136, + "grad_norm": 3.056872844696045, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51906816, + "loss": 1.353, + "grad_norm": 1.6506890058517456, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519069696, + "loss": 1.3692, + "grad_norm": 1.8252465724945068, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519072768, + "loss": 1.3709, + "grad_norm": 1.1712000370025635, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51899904, + "loss": 1.4354, + "grad_norm": 2.106110095977783, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.548955648, + "gpu_mem": 4.518994432, + "loss": 1.3901, + "grad_norm": 1.2667193412780762, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51904512, + "loss": 1.3518, + "grad_norm": 1.8704897165298462, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519174144, + "loss": 1.4435, + "grad_norm": 3.481454610824585, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519035904, + "loss": 1.3983, + "grad_norm": 3.441441535949707, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519062016, + "loss": 1.3683, + "grad_norm": 2.3860650062561035, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519074304, + "loss": 1.3892, + "grad_norm": 3.3840084075927734, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519046656, + "loss": 1.4276, + "grad_norm": 6.038804054260254, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519020544, + "loss": 1.3835, + "grad_norm": 1.3335877656936646, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519032832, + "loss": 1.4232, + "grad_norm": 12.959759712219238, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51906048, + "loss": 1.3561, + "grad_norm": 1.7554781436920166, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519078912, + "loss": 1.3668, + "grad_norm": 1.0784075260162354, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.548955648, + "gpu_mem": 4.5191296, + "loss": 1.3821, + "grad_norm": 1.8996286392211914, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519080448, + "loss": 1.4031, + "grad_norm": 1.688071846961975, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519031296, + "loss": 1.4057, + "grad_norm": 2.209498405456543, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51907584, + "loss": 1.41, + "grad_norm": 1.8450075387954712, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.548955648, + "gpu_mem": 4.518974464, + "loss": 1.3854, + "grad_norm": 1.9602433443069458, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519042048, + "loss": 1.3793, + "grad_norm": 0.9755760431289673, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519032832, + "loss": 1.3626, + "grad_norm": 1.9989182949066162, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51903744, + "loss": 1.3491, + "grad_norm": 2.4246444702148438, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51915264, + "loss": 1.4338, + "grad_norm": 3.0701518058776855, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519112704, + "loss": 1.4434, + "grad_norm": 2.9751205444335938, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519071232, + "loss": 1.4173, + "grad_norm": 2.461958408355713, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519069696, + "loss": 1.3792, + "grad_norm": 2.3899643421173096, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519094272, + "loss": 1.3939, + "grad_norm": 1.7204440832138062, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51911424, + "loss": 1.3776, + "grad_norm": 1.010871171951294, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519031296, + "loss": 1.4043, + "grad_norm": 2.655653476715088, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519166464, + "loss": 1.3725, + "grad_norm": 1.9991810321807861, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519120384, + "loss": 1.4126, + "grad_norm": 3.7145869731903076, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519141888, + "loss": 1.3923, + "grad_norm": 1.9210848808288574, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519028224, + "loss": 1.3607, + "grad_norm": 2.778935670852661, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519015936, + "loss": 1.4102, + "grad_norm": 2.4693703651428223, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519020544, + "loss": 1.3818, + "grad_norm": 1.3956981897354126, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519057408, + "loss": 1.3922, + "grad_norm": 1.3113068342208862, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519019008, + "loss": 1.4235, + "grad_norm": 2.7557179927825928, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519065088, + "loss": 1.3765, + "grad_norm": 1.8259626626968384, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519074304, + "loss": 1.3889, + "grad_norm": 1.3236335515975952, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519112704, + "loss": 1.366, + "grad_norm": 1.3230165243148804, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519031296, + "loss": 1.402, + "grad_norm": 2.3951914310455322, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519040512, + "loss": 1.4133, + "grad_norm": 2.732980489730835, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51910656, + "loss": 1.374, + "grad_norm": 1.2674767971038818, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519032832, + "loss": 1.3676, + "grad_norm": 1.6378636360168457, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519026688, + "loss": 1.4129, + "grad_norm": 2.0275728702545166, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519092736, + "loss": 1.408, + "grad_norm": 2.633847713470459, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519051264, + "loss": 1.3643, + "grad_norm": 1.5230259895324707, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519164928, + "loss": 1.3953, + "grad_norm": 3.567425489425659, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519042048, + "loss": 1.4101, + "grad_norm": 2.446251392364502, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51906816, + "loss": 1.3919, + "grad_norm": 1.1604695320129395, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519074304, + "loss": 1.3926, + "grad_norm": 2.1555254459381104, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519141888, + "loss": 1.4127, + "grad_norm": 1.6217305660247803, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.548955648, + "gpu_mem": 4.518997504, + "loss": 1.4168, + "grad_norm": 2.0662100315093994, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519131136, + "loss": 1.3963, + "grad_norm": 1.0134828090667725, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51916032, + "loss": 1.3643, + "grad_norm": 1.3594999313354492, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51902976, + "loss": 1.4212, + "grad_norm": 2.550603151321411, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519031296, + "loss": 1.3825, + "grad_norm": 2.2979753017425537, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519032832, + "loss": 1.3831, + "grad_norm": 1.6919949054718018, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51910656, + "loss": 1.4594, + "grad_norm": 3.7549149990081787, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519088128, + "loss": 1.3688, + "grad_norm": 0.7282609343528748, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519031296, + "loss": 1.4136, + "grad_norm": 1.3225268125534058, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519069696, + "loss": 1.3976, + "grad_norm": 1.7072538137435913, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51913728, + "loss": 1.3882, + "grad_norm": 1.1817103624343872, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519089664, + "loss": 1.3887, + "grad_norm": 1.0569573640823364, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519217152, + "loss": 1.3818, + "grad_norm": 1.35332453250885, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519101952, + "loss": 1.3738, + "grad_norm": 0.8303911089897156, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519051264, + "loss": 1.3672, + "grad_norm": 1.1599332094192505, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519038976, + "loss": 1.3685, + "grad_norm": 2.131857395172119, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519031296, + "loss": 1.4124, + "grad_norm": 2.0581538677215576, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519154176, + "loss": 1.4115, + "grad_norm": 1.6868329048156738, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519094272, + "loss": 1.4349, + "grad_norm": 2.509308338165283, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519040512, + "loss": 1.4071, + "grad_norm": 1.8864262104034424, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519057408, + "loss": 1.3591, + "grad_norm": 1.4728574752807617, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519197184, + "loss": 1.4458, + "grad_norm": 3.7232115268707275, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51906816, + "loss": 1.5024, + "grad_norm": 6.0232744216918945, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519063552, + "loss": 1.4825, + "grad_norm": 6.215474605560303, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.548955648, + "gpu_mem": 4.518966784, + "loss": 1.4069, + "grad_norm": 2.9568498134613037, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519042048, + "loss": 1.4051, + "grad_norm": 3.139335870742798, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519062016, + "loss": 1.3841, + "grad_norm": 1.7651118040084839, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519028224, + "loss": 1.3825, + "grad_norm": 1.6057661771774292, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.548955648, + "gpu_mem": 4.518989824, + "loss": 1.4342, + "grad_norm": 3.1051573753356934, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519115776, + "loss": 1.4239, + "grad_norm": 3.0075109004974365, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.548955648, + "gpu_mem": 4.518992896, + "loss": 1.4188, + "grad_norm": 2.238313913345337, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519035904, + "loss": 1.4046, + "grad_norm": 1.8450006246566772, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519019008, + "loss": 1.3845, + "grad_norm": 2.207218885421753, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51906048, + "loss": 1.4113, + "grad_norm": 2.075350046157837, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519000576, + "loss": 1.4018, + "grad_norm": 1.548322081565857, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519023616, + "loss": 1.4113, + "grad_norm": 1.9908292293548584, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519012864, + "loss": 1.3773, + "grad_norm": 0.9290497899055481, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519025152, + "loss": 1.3929, + "grad_norm": 1.0981857776641846, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519028224, + "loss": 1.3381, + "grad_norm": 1.9414076805114746, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519009792, + "loss": 1.3896, + "grad_norm": 0.9831597208976746, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519015936, + "loss": 1.4125, + "grad_norm": 1.4598218202590942, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.548955648, + "gpu_mem": 4.518963712, + "loss": 1.4513, + "grad_norm": 2.670051336288452, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519035904, + "loss": 1.3856, + "grad_norm": 1.1799101829528809, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519077376, + "loss": 1.3858, + "grad_norm": 1.3579021692276, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519025152, + "loss": 1.3759, + "grad_norm": 0.8210622072219849, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519035904, + "loss": 1.4006, + "grad_norm": 1.0406382083892822, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519019008, + "loss": 1.4177, + "grad_norm": 2.0407252311706543, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519074304, + "loss": 1.3886, + "grad_norm": 0.7341501712799072, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519040512, + "loss": 1.3913, + "grad_norm": 0.6332818865776062, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519049728, + "loss": 1.4015, + "grad_norm": 0.7691799998283386, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519051264, + "loss": 1.3857, + "grad_norm": 1.4171892404556274, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519278592, + "loss": 1.3921, + "grad_norm": 1.2236794233322144, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519071232, + "loss": 1.388, + "grad_norm": 0.9952640533447266, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519043584, + "loss": 1.3978, + "grad_norm": 1.3725591897964478, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519008256, + "loss": 1.3609, + "grad_norm": 1.4044119119644165, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51909888, + "loss": 1.3906, + "grad_norm": 1.4221484661102295, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519023616, + "loss": 1.3735, + "grad_norm": 1.5416958332061768, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519071232, + "loss": 1.379, + "grad_norm": 1.1094119548797607, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519012864, + "loss": 1.3698, + "grad_norm": 1.61104154586792, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519234048, + "loss": 1.4173, + "grad_norm": 1.9689369201660156, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519115776, + "loss": 1.3917, + "grad_norm": 1.776476263999939, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519008256, + "loss": 1.3791, + "grad_norm": 2.4010748863220215, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519048192, + "loss": 1.3783, + "grad_norm": 1.4185007810592651, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519095808, + "loss": 1.362, + "grad_norm": 1.381518840789795, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519062016, + "loss": 1.392, + "grad_norm": 1.8509361743927002, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519117312, + "loss": 1.3735, + "grad_norm": 1.4929441213607788, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519009792, + "loss": 1.3717, + "grad_norm": 1.5579564571380615, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519094272, + "loss": 1.3936, + "grad_norm": 2.450465440750122, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.548955648, + "gpu_mem": 4.51908352, + "loss": 1.3333, + "grad_norm": 1.493731141090393, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519038976, + "loss": 1.3918, + "grad_norm": 1.8603183031082153, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519128064, + "loss": 1.4351, + "grad_norm": 1.7325583696365356, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519055872, + "loss": 1.3882, + "grad_norm": 1.317692756652832, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.548955648, + "gpu_mem": 4.519065088, + "loss": 1.3679, + "grad_norm": 1.0434831380844116, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519117312, + "loss": 1.3657, + "grad_norm": 1.3659099340438843, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519117312, + "loss": 1.3506, + "grad_norm": 2.4785165786743164, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519154176, + "loss": 1.3883, + "grad_norm": 2.2068703174591064, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519071232, + "loss": 1.3939, + "grad_norm": 1.4903738498687744, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519140352, + "loss": 1.3945, + "grad_norm": 1.1913816928863525, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519081984, + "loss": 1.3872, + "grad_norm": 1.060878872871399, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519088128, + "loss": 1.3658, + "grad_norm": 1.6997623443603516, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.549152256, + "gpu_mem": 4.51899904, + "loss": 1.3937, + "grad_norm": 1.3663522005081177, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519117312, + "loss": 1.3806, + "grad_norm": 0.82390296459198, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519069696, + "loss": 1.3873, + "grad_norm": 1.53325355052948, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519015936, + "loss": 1.3995, + "grad_norm": 1.9457041025161743, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519065088, + "loss": 1.3818, + "grad_norm": 1.1930177211761475, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.549152256, + "gpu_mem": 4.51919104, + "loss": 1.3929, + "grad_norm": 1.5315130949020386, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.549152256, + "gpu_mem": 4.518989824, + "loss": 1.3779, + "grad_norm": 1.124269723892212, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519054336, + "loss": 1.3828, + "grad_norm": 1.0576345920562744, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519069696, + "loss": 1.3669, + "grad_norm": 1.0675820112228394, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.549152256, + "gpu_mem": 4.51906048, + "loss": 1.3942, + "grad_norm": 1.3536916971206665, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519057408, + "loss": 1.3662, + "grad_norm": 1.4612363576889038, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519032832, + "loss": 1.4178, + "grad_norm": 1.9625743627548218, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519015936, + "loss": 1.3513, + "grad_norm": 1.282235860824585, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519017472, + "loss": 1.3683, + "grad_norm": 1.4586429595947266, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519069696, + "loss": 1.3887, + "grad_norm": 1.8730717897415161, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519135744, + "loss": 1.3979, + "grad_norm": 3.02571439743042, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519089664, + "loss": 1.3552, + "grad_norm": 1.8004968166351318, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519140352, + "loss": 1.3815, + "grad_norm": 1.717481017112732, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519111168, + "loss": 1.3856, + "grad_norm": 2.044525384902954, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.549152256, + "gpu_mem": 4.51904512, + "loss": 1.3606, + "grad_norm": 2.1291258335113525, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519002112, + "loss": 1.3378, + "grad_norm": 1.6260528564453125, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.549152256, + "gpu_mem": 4.51902976, + "loss": 1.3396, + "grad_norm": 3.646223783493042, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519012864, + "loss": 1.4021, + "grad_norm": 2.7909398078918457, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519055872, + "loss": 1.3705, + "grad_norm": 2.3960258960723877, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519109632, + "loss": 1.3673, + "grad_norm": 2.3699557781219482, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.549152256, + "gpu_mem": 4.51900672, + "loss": 1.3707, + "grad_norm": 2.2000036239624023, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519066624, + "loss": 1.3563, + "grad_norm": 2.364560127258301, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.549152256, + "gpu_mem": 4.518989824, + "loss": 1.3873, + "grad_norm": 2.3937504291534424, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519149568, + "loss": 1.3308, + "grad_norm": 1.9409763813018799, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519003648, + "loss": 1.3667, + "grad_norm": 1.9023265838623047, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519026688, + "loss": 1.3342, + "grad_norm": 2.1334292888641357, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519049728, + "loss": 1.4339, + "grad_norm": 3.2036404609680176, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519273984, + "loss": 1.4098, + "grad_norm": 2.8360462188720703, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519031296, + "loss": 1.3742, + "grad_norm": 1.9485664367675781, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519154176, + "loss": 1.3819, + "grad_norm": 1.702607274055481, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519069696, + "loss": 1.3555, + "grad_norm": 9.792722702026367, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519158784, + "loss": 1.3959, + "grad_norm": 3.7921090126037598, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519057408, + "loss": 1.3878, + "grad_norm": 2.05283522605896, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519065088, + "loss": 1.3848, + "grad_norm": 1.2519477605819702, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.549152256, + "gpu_mem": 4.518988288, + "loss": 1.3483, + "grad_norm": 1.5672640800476074, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.549152256, + "gpu_mem": 4.51906816, + "loss": 1.3848, + "grad_norm": 1.9949630498886108, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519038976, + "loss": 1.3942, + "grad_norm": 1.889150857925415, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519026688, + "loss": 1.3861, + "grad_norm": 1.7564359903335571, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519169536, + "loss": 1.358, + "grad_norm": 1.7210222482681274, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.549152256, + "gpu_mem": 4.5190528, + "loss": 1.348, + "grad_norm": 1.6718264818191528, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519063552, + "loss": 1.3993, + "grad_norm": 1.8039673566818237, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519218688, + "loss": 1.3825, + "grad_norm": 1.732738733291626, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519028224, + "loss": 1.3738, + "grad_norm": 2.3856730461120605, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519049728, + "loss": 1.363, + "grad_norm": 1.8309390544891357, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.549152256, + "gpu_mem": 4.519086592, + "loss": 1.3717, + "grad_norm": 1.8361440896987915, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.549152256, + "gpu_mem": 4.518673408, + "loss": 2.0501, + "grad_norm": 3.271700859069824, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493698048, + "loss": 1.3115, + "grad_norm": 2.6110754013061523, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493676544, + "loss": 1.3578, + "grad_norm": 2.151705503463745, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493691904, + "loss": 1.3679, + "grad_norm": 2.3508405685424805, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49383936, + "loss": 1.335, + "grad_norm": 2.5820233821868896, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4937856, + "loss": 1.3686, + "grad_norm": 2.557969570159912, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493802496, + "loss": 1.338, + "grad_norm": 2.5558559894561768, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49375488, + "loss": 1.3421, + "grad_norm": 2.715161085128784, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493779456, + "loss": 1.2774, + "grad_norm": 3.419095516204834, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493827072, + "loss": 1.32, + "grad_norm": 4.199026107788086, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493736448, + "loss": 1.2904, + "grad_norm": 3.5525810718536377, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4937088, + "loss": 1.2325, + "grad_norm": 4.239084720611572, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493797888, + "loss": 1.2924, + "grad_norm": 4.611480236053467, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493688832, + "loss": 1.2237, + "grad_norm": 4.568148136138916, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493741056, + "loss": 1.3851, + "grad_norm": 6.48914098739624, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493694976, + "loss": 1.2642, + "grad_norm": 6.341342926025391, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493897728, + "loss": 1.3542, + "grad_norm": 6.5845746994018555, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493896192, + "loss": 1.361, + "grad_norm": 5.641256809234619, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493819392, + "loss": 1.2669, + "grad_norm": 3.9064013957977295, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493776384, + "loss": 1.3022, + "grad_norm": 3.762362241744995, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493753344, + "loss": 1.2309, + "grad_norm": 4.183384418487549, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493707264, + "loss": 1.2612, + "grad_norm": 3.257612466812134, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493824, + "loss": 1.3482, + "grad_norm": 4.347290992736816, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493711872, + "loss": 1.3876, + "grad_norm": 4.673461437225342, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49370112, + "loss": 1.4114, + "grad_norm": 5.684346675872803, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493705728, + "loss": 1.3316, + "grad_norm": 4.307408809661865, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493820928, + "loss": 1.3774, + "grad_norm": 5.346398830413818, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493794816, + "loss": 1.2512, + "grad_norm": 3.915797710418701, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493696512, + "loss": 1.2441, + "grad_norm": 3.704847574234009, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49373184, + "loss": 1.2819, + "grad_norm": 3.336219310760498, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493698048, + "loss": 1.3241, + "grad_norm": 4.151459217071533, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49383168, + "loss": 1.2917, + "grad_norm": 3.7383005619049072, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493805568, + "loss": 1.3142, + "grad_norm": 4.273043155670166, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493850112, + "loss": 1.1988, + "grad_norm": 3.7781248092651367, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493741056, + "loss": 1.3113, + "grad_norm": 4.42944860458374, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493757952, + "loss": 1.2708, + "grad_norm": 4.251877784729004, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493833216, + "loss": 1.3781, + "grad_norm": 5.335277080535889, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493681152, + "loss": 1.308, + "grad_norm": 4.778660297393799, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493799424, + "loss": 1.3195, + "grad_norm": 5.388873100280762, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493788672, + "loss": 1.2813, + "grad_norm": 5.598742961883545, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49375488, + "loss": 1.3026, + "grad_norm": 4.444918632507324, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493733376, + "loss": 1.3523, + "grad_norm": 5.93340539932251, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493728768, + "loss": 1.232, + "grad_norm": 3.8733508586883545, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49373952, + "loss": 1.3105, + "grad_norm": 5.589296817779541, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49379328, + "loss": 1.267, + "grad_norm": 4.350371837615967, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493710336, + "loss": 1.2491, + "grad_norm": 4.154341697692871, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493737984, + "loss": 1.2786, + "grad_norm": 4.494115352630615, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493736448, + "loss": 1.2733, + "grad_norm": 5.1525397300720215, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493779456, + "loss": 1.2216, + "grad_norm": 4.6003828048706055, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49377792, + "loss": 1.2858, + "grad_norm": 4.7908711433410645, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493828608, + "loss": 1.3247, + "grad_norm": 5.7500996589660645, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493834752, + "loss": 1.2202, + "grad_norm": 6.010868549346924, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493773312, + "loss": 1.2272, + "grad_norm": 5.480494022369385, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49376256, + "loss": 1.3627, + "grad_norm": 7.722513675689697, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493787136, + "loss": 1.3291, + "grad_norm": 6.564704418182373, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493804032, + "loss": 1.2982, + "grad_norm": 8.189974784851074, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493675008, + "loss": 1.2911, + "grad_norm": 7.6172943115234375, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493850112, + "loss": 1.2149, + "grad_norm": 7.927183628082275, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493807104, + "loss": 1.2363, + "grad_norm": 8.117219924926758, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49371648, + "loss": 1.2273, + "grad_norm": 6.407708644866943, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493664256, + "loss": 1.2083, + "grad_norm": 6.195032119750977, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493679616, + "loss": 1.2518, + "grad_norm": 6.793033123016357, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493780992, + "loss": 1.2536, + "grad_norm": 6.997833251953125, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493790208, + "loss": 1.299, + "grad_norm": 8.61462116241455, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493819392, + "loss": 1.2746, + "grad_norm": 6.848941326141357, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493880832, + "loss": 1.3485, + "grad_norm": 6.327116966247559, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493734912, + "loss": 1.2954, + "grad_norm": 6.472503662109375, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493822464, + "loss": 1.2276, + "grad_norm": 6.660966873168945, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.2533, + "grad_norm": 6.101393222808838, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493796352, + "loss": 1.2957, + "grad_norm": 6.192693710327148, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49383168, + "loss": 1.2324, + "grad_norm": 7.561249732971191, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49377024, + "loss": 1.2437, + "grad_norm": 6.279735088348389, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493727232, + "loss": 1.279, + "grad_norm": 6.561028003692627, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493728768, + "loss": 1.2687, + "grad_norm": 5.712686061859131, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493765632, + "loss": 1.215, + "grad_norm": 5.405893325805664, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493737984, + "loss": 1.1833, + "grad_norm": 5.302999973297119, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493794816, + "loss": 1.2669, + "grad_norm": 5.752892971038818, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493824, + "loss": 1.2402, + "grad_norm": 5.920003414154053, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49372416, + "loss": 1.2897, + "grad_norm": 6.862788200378418, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4937472, + "loss": 1.2667, + "grad_norm": 6.120493412017822, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493722624, + "loss": 1.2213, + "grad_norm": 5.794097423553467, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493814784, + "loss": 1.2088, + "grad_norm": 6.443933486938477, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493684224, + "loss": 1.2838, + "grad_norm": 9.36483383178711, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493820928, + "loss": 1.2764, + "grad_norm": 5.715517044067383, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49367808, + "loss": 1.246, + "grad_norm": 6.216403007507324, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49370112, + "loss": 1.165, + "grad_norm": 6.445955276489258, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493748736, + "loss": 1.2521, + "grad_norm": 5.697048187255859, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493791744, + "loss": 1.2813, + "grad_norm": 5.7486090660095215, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493719552, + "loss": 1.3069, + "grad_norm": 7.398179054260254, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493817856, + "loss": 1.2701, + "grad_norm": 8.523886680603027, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493713408, + "loss": 1.2195, + "grad_norm": 5.738861083984375, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493868544, + "loss": 1.1508, + "grad_norm": 5.326663017272949, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49370112, + "loss": 1.2394, + "grad_norm": 7.559383392333984, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493744128, + "loss": 1.2785, + "grad_norm": 6.067483425140381, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493765632, + "loss": 1.2229, + "grad_norm": 7.255116939544678, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493688832, + "loss": 1.3228, + "grad_norm": 9.263710975646973, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493825536, + "loss": 1.235, + "grad_norm": 6.926446914672852, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493713408, + "loss": 1.1499, + "grad_norm": 6.095040798187256, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493710336, + "loss": 1.3224, + "grad_norm": 6.985899448394775, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493684224, + "loss": 1.293, + "grad_norm": 9.759529113769531, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493719552, + "loss": 1.1295, + "grad_norm": 8.705277442932129, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493734912, + "loss": 1.0758, + "grad_norm": 6.266777515411377, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493728768, + "loss": 1.1269, + "grad_norm": 6.509443759918213, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493741056, + "loss": 1.3302, + "grad_norm": 8.003247261047363, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493773312, + "loss": 1.3063, + "grad_norm": 8.047350883483887, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493774848, + "loss": 1.3441, + "grad_norm": 7.563324451446533, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4937472, + "loss": 1.2675, + "grad_norm": 7.774256229400635, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493796352, + "loss": 1.213, + "grad_norm": 8.11932373046875, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.1672, + "grad_norm": 7.599986553192139, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493784064, + "loss": 1.2484, + "grad_norm": 7.470830917358398, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493774848, + "loss": 1.1662, + "grad_norm": 6.462966442108154, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4937088, + "loss": 1.3026, + "grad_norm": 7.411251544952393, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493730304, + "loss": 1.2561, + "grad_norm": 15.114665985107422, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4937856, + "loss": 1.2324, + "grad_norm": 7.802770137786865, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493702656, + "loss": 1.3252, + "grad_norm": 6.871520042419434, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493767168, + "loss": 1.2776, + "grad_norm": 9.012080192565918, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.1623, + "grad_norm": 5.9950971603393555, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493757952, + "loss": 1.1957, + "grad_norm": 7.178092956542969, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49373184, + "loss": 1.3011, + "grad_norm": 8.181595802307129, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.1186, + "grad_norm": 6.441361904144287, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493736448, + "loss": 1.3079, + "grad_norm": 7.951591491699219, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493641216, + "loss": 1.3558, + "grad_norm": 8.894723892211914, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493710336, + "loss": 1.2438, + "grad_norm": 9.77177906036377, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493757952, + "loss": 1.2801, + "grad_norm": 15.15963077545166, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493702656, + "loss": 1.2735, + "grad_norm": 7.6226935386657715, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493761024, + "loss": 1.2203, + "grad_norm": 6.464207172393799, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493767168, + "loss": 1.2161, + "grad_norm": 7.604987621307373, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493903872, + "loss": 1.2512, + "grad_norm": 6.369946002960205, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493713408, + "loss": 1.2467, + "grad_norm": 9.130136489868164, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49377792, + "loss": 1.2229, + "grad_norm": 6.836489200592041, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49369344, + "loss": 1.232, + "grad_norm": 6.147420406341553, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493710336, + "loss": 1.1679, + "grad_norm": 6.486333847045898, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493710336, + "loss": 1.2807, + "grad_norm": 10.21741008758545, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493836288, + "loss": 1.3166, + "grad_norm": 6.745300769805908, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49371648, + "loss": 1.093, + "grad_norm": 6.137012958526611, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493874688, + "loss": 1.1726, + "grad_norm": 6.37439489364624, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493699584, + "loss": 1.2454, + "grad_norm": 7.248071193695068, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493718016, + "loss": 1.2577, + "grad_norm": 6.570479869842529, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493894656, + "loss": 1.183, + "grad_norm": 6.513850688934326, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493768704, + "loss": 1.1505, + "grad_norm": 6.98987340927124, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493765632, + "loss": 1.2294, + "grad_norm": 6.274317741394043, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493774848, + "loss": 1.2458, + "grad_norm": 7.534372806549072, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493650432, + "loss": 1.2501, + "grad_norm": 6.9775238037109375, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493933056, + "loss": 1.3793, + "grad_norm": 8.388665199279785, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493696512, + "loss": 1.2002, + "grad_norm": 6.396453380584717, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493791744, + "loss": 1.2441, + "grad_norm": 9.681181907653809, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493757952, + "loss": 1.2945, + "grad_norm": 12.482309341430664, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.2768, + "grad_norm": 16.456439971923828, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493744128, + "loss": 1.3088, + "grad_norm": 7.781978130340576, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493794816, + "loss": 1.2509, + "grad_norm": 7.159851551055908, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493690368, + "loss": 1.3432, + "grad_norm": 7.764480113983154, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493719552, + "loss": 1.3175, + "grad_norm": 10.31360912322998, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49366272, + "loss": 1.3299, + "grad_norm": 7.8830742835998535, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493737984, + "loss": 1.1795, + "grad_norm": 6.658789157867432, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493730304, + "loss": 1.2336, + "grad_norm": 8.826005935668945, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493761024, + "loss": 1.1937, + "grad_norm": 8.552996635437012, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493690368, + "loss": 1.2638, + "grad_norm": 6.332573890686035, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49371648, + "loss": 1.2379, + "grad_norm": 7.895116806030273, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493811712, + "loss": 1.1673, + "grad_norm": 8.463910102844238, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493707264, + "loss": 1.1589, + "grad_norm": 6.721690654754639, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49371648, + "loss": 1.2045, + "grad_norm": 8.094408988952637, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493744128, + "loss": 1.3074, + "grad_norm": 7.108813762664795, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493757952, + "loss": 1.2889, + "grad_norm": 7.752115726470947, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493751808, + "loss": 1.2651, + "grad_norm": 6.113507270812988, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493714944, + "loss": 1.1764, + "grad_norm": 5.841967582702637, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493773312, + "loss": 1.2546, + "grad_norm": 7.509109020233154, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493682688, + "loss": 1.2861, + "grad_norm": 7.103532314300537, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493767168, + "loss": 1.1458, + "grad_norm": 6.623390197753906, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493805568, + "loss": 1.2161, + "grad_norm": 14.436988830566406, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493850112, + "loss": 1.3615, + "grad_norm": 7.930851459503174, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493811712, + "loss": 1.2717, + "grad_norm": 7.868764400482178, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493748736, + "loss": 1.3069, + "grad_norm": 6.627160549163818, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493802496, + "loss": 1.2508, + "grad_norm": 15.541763305664062, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493741056, + "loss": 1.2922, + "grad_norm": 12.958121299743652, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493727232, + "loss": 1.1732, + "grad_norm": 6.77432918548584, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4939392, + "loss": 1.1505, + "grad_norm": 6.752580165863037, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493804032, + "loss": 1.2743, + "grad_norm": 7.889694690704346, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.549152256, + "gpu_mem": 4.49377792, + "loss": 1.1997, + "grad_norm": 9.918436050415039, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493784064, + "loss": 1.221, + "grad_norm": 19.192699432373047, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493817856, + "loss": 1.3288, + "grad_norm": 9.466809272766113, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.2348, + "grad_norm": 9.121527671813965, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493810176, + "loss": 1.2017, + "grad_norm": 8.814129829406738, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493698048, + "loss": 1.2682, + "grad_norm": 12.099109649658203, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493761024, + "loss": 1.2118, + "grad_norm": 6.347292423248291, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.2037, + "grad_norm": 7.579617500305176, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493725696, + "loss": 1.2345, + "grad_norm": 8.67547607421875, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493810176, + "loss": 1.3132, + "grad_norm": 8.392699241638184, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493756416, + "loss": 1.3315, + "grad_norm": 7.64285945892334, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493776384, + "loss": 1.329, + "grad_norm": 12.694356918334961, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493704192, + "loss": 1.2005, + "grad_norm": 7.679481029510498, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493791744, + "loss": 1.3267, + "grad_norm": 10.184659957885742, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493759488, + "loss": 1.1493, + "grad_norm": 6.854414939880371, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493737984, + "loss": 1.1749, + "grad_norm": 23.339820861816406, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493681152, + "loss": 1.2808, + "grad_norm": 6.932606220245361, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.549152256, + "gpu_mem": 4.4937472, + "loss": 1.2562, + "grad_norm": 7.118431091308594, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493788672, + "loss": 1.2232, + "grad_norm": 13.628323554992676, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.549152256, + "gpu_mem": 4.493788672, + "train_runtime": 8387.9552, + "train_samples_per_second": 4.495, + "train_steps_per_second": 0.07, + "total_flos": 0.0, + "train_loss": 1.3757885437027937 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..91562a2718627f56cb3f88093dd26c3a98c35384 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..19aca3d26898c37dfaf3a62416e4f84aeb02d080 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5043409629044988 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..c201866487508992c3ff36704171e7f9f5fb6234 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1577576 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-winogrande-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2", + "seed": 42, + "timestamp": "2025-08-30T16:10:34.860587" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..6f6383bd3a5e5c2a64f933fc4a38bd93927de1f9 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r2-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.487515648, + "gpu_mem": 4.423721472, + "loss": 3.3802, + "grad_norm": 285.3889465332031, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.493413888, + "gpu_mem": 4.4364928, + "loss": 3.3361, + "grad_norm": 279.3614196777344, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.494003712, + "gpu_mem": 4.436497408, + "loss": 2.9072, + "grad_norm": 260.0558776855469, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.494396928, + "gpu_mem": 4.436495872, + "loss": 2.2773, + "grad_norm": 233.6356201171875, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.494790144, + "gpu_mem": 4.436495872, + "loss": 1.6509, + "grad_norm": 153.1364288330078, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.494986752, + "gpu_mem": 4.436502016, + "loss": 1.1986, + "grad_norm": 79.50932312011719, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.49518336, + "gpu_mem": 4.43650816, + "loss": 0.9684, + "grad_norm": 93.7468032836914, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.495379968, + "gpu_mem": 4.436491264, + "loss": 0.8225, + "grad_norm": 33.79230499267578, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.495576576, + "gpu_mem": 4.436497408, + "loss": 0.7797, + "grad_norm": 39.196659088134766, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.495773184, + "gpu_mem": 4.43650048, + "loss": 0.9063, + "grad_norm": 121.2220230102539, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.495969792, + "gpu_mem": 4.436489728, + "loss": 0.8815, + "grad_norm": 95.12213897705078, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.4961664, + "gpu_mem": 4.436494336, + "loss": 0.6693, + "grad_norm": 9.803084373474121, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.4961664, + "gpu_mem": 4.436502016, + "loss": 0.741, + "grad_norm": 15.395696640014648, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.496363008, + "gpu_mem": 4.436497408, + "loss": 0.7664, + "grad_norm": 27.210342407226562, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.496559616, + "gpu_mem": 4.436497408, + "loss": 0.6949, + "grad_norm": 10.07458782196045, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.496756224, + "gpu_mem": 4.436494336, + "loss": 0.6965, + "grad_norm": 11.679877281188965, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.496756224, + "gpu_mem": 4.436494336, + "loss": 0.7316, + "grad_norm": 19.22392463684082, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.496756224, + "gpu_mem": 4.436497408, + "loss": 0.6987, + "grad_norm": 7.040602684020996, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.496756224, + "gpu_mem": 4.436494336, + "loss": 0.7117, + "grad_norm": 10.438284873962402, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.496952832, + "gpu_mem": 4.436502016, + "loss": 0.7258, + "grad_norm": 10.764849662780762, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.496952832, + "gpu_mem": 4.436494336, + "loss": 0.6962, + "grad_norm": 11.162935256958008, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.49714944, + "gpu_mem": 4.436494336, + "loss": 0.7436, + "grad_norm": 13.140003204345703, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.49714944, + "gpu_mem": 4.436489728, + "loss": 0.6971, + "grad_norm": 10.013650894165039, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.49714944, + "gpu_mem": 4.4364928, + "loss": 0.6991, + "grad_norm": 4.925834655761719, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.49714944, + "gpu_mem": 4.436495872, + "loss": 0.7101, + "grad_norm": 6.387951850891113, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.49714944, + "gpu_mem": 4.436491264, + "loss": 0.7477, + "grad_norm": 16.547260284423828, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.497346048, + "gpu_mem": 4.436489728, + "loss": 0.6878, + "grad_norm": 4.216243743896484, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.497346048, + "gpu_mem": 4.436495872, + "loss": 0.7359, + "grad_norm": 14.428729057312012, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.497346048, + "gpu_mem": 4.436494336, + "loss": 0.694, + "grad_norm": 3.808842658996582, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.497346048, + "gpu_mem": 4.436494336, + "loss": 0.7787, + "grad_norm": 18.62034034729004, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.497346048, + "gpu_mem": 4.436494336, + "loss": 0.769, + "grad_norm": 16.24230194091797, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.497346048, + "gpu_mem": 4.436491264, + "loss": 0.7193, + "grad_norm": 11.8719482421875, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436491264, + "loss": 0.7475, + "grad_norm": 15.560171127319336, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436491264, + "loss": 0.7725, + "grad_norm": 16.460046768188477, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436497408, + "loss": 0.7281, + "grad_norm": 10.301942825317383, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.497542656, + "gpu_mem": 4.4364928, + "loss": 0.6989, + "grad_norm": 8.131368637084961, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436491264, + "loss": 0.6941, + "grad_norm": 3.8053653240203857, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436495872, + "loss": 0.7254, + "grad_norm": 9.349738121032715, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436502016, + "loss": 0.739, + "grad_norm": 10.760190963745117, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436498944, + "loss": 0.7081, + "grad_norm": 5.190445899963379, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436498944, + "loss": 0.7034, + "grad_norm": 3.949763059616089, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.497542656, + "gpu_mem": 4.436495872, + "loss": 0.7171, + "grad_norm": 8.843171119689941, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7255, + "grad_norm": 6.156717300415039, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6926, + "grad_norm": 4.2227091789245605, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6888, + "grad_norm": 1.6789780855178833, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7247, + "grad_norm": 5.187302589416504, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7053, + "grad_norm": 3.702117681503296, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6833, + "grad_norm": 2.526420831680298, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.8034, + "grad_norm": 13.83984088897705, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7013, + "grad_norm": 1.77960205078125, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7001, + "grad_norm": 4.421973705291748, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7146, + "grad_norm": 7.783204555511475, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.8606, + "grad_norm": 155.81857299804688, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43648512, + "loss": 0.6999, + "grad_norm": 26.071014404296875, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.714, + "grad_norm": 5.301547527313232, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6965, + "grad_norm": 2.858142852783203, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.7615, + "grad_norm": 11.273687362670898, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7276, + "grad_norm": 7.816659927368164, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6895, + "grad_norm": 1.4052209854125977, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7108, + "grad_norm": 4.213801383972168, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7167, + "grad_norm": 5.404194355010986, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6923, + "grad_norm": 1.015683889389038, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6911, + "grad_norm": 4.154592990875244, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.7367, + "grad_norm": 8.377941131591797, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7282, + "grad_norm": 7.020410060882568, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.695, + "grad_norm": 1.5096642971038818, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6875, + "grad_norm": 3.222912549972534, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7327, + "grad_norm": 5.488400936126709, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7112, + "grad_norm": 3.8396189212799072, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.69, + "grad_norm": 1.7706767320632935, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.8567, + "grad_norm": 24.09041976928711, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7703, + "grad_norm": 17.66923713684082, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6947, + "grad_norm": 4.4372406005859375, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7178, + "grad_norm": 7.610154628753662, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7512, + "grad_norm": 10.189748764038086, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6783, + "grad_norm": 1.04703688621521, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.7146, + "grad_norm": 3.964552164077759, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6976, + "grad_norm": 3.6057846546173096, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.712, + "grad_norm": 4.002328395843506, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7043, + "grad_norm": 1.9990167617797852, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6962, + "grad_norm": 0.9582574367523193, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7173, + "grad_norm": 4.804891109466553, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.683, + "grad_norm": 1.315712809562683, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7392, + "grad_norm": 8.15776538848877, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7154, + "grad_norm": 4.294924259185791, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7126, + "grad_norm": 5.7178568840026855, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7332, + "grad_norm": 7.980727195739746, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6931, + "grad_norm": 0.731619656085968, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.695, + "grad_norm": 2.1438097953796387, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7261, + "grad_norm": 4.310764312744141, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7242, + "grad_norm": 5.902415752410889, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7186, + "grad_norm": 4.6063456535339355, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7012, + "grad_norm": 2.964845895767212, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6916, + "grad_norm": 0.7296403050422668, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7022, + "grad_norm": 2.481067657470703, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6997, + "grad_norm": 2.9389548301696777, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6946, + "grad_norm": 0.5389977693557739, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6977, + "grad_norm": 0.4456123411655426, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6981, + "grad_norm": 3.107194662094116, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7102, + "grad_norm": 2.419109344482422, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7002, + "grad_norm": 1.8480744361877441, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6945, + "grad_norm": 1.7667357921600342, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7566, + "grad_norm": 7.269573211669922, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7276, + "grad_norm": 5.319644451141357, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7199, + "grad_norm": 3.338465690612793, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6987, + "grad_norm": 1.7126240730285645, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6969, + "grad_norm": 3.4810101985931396, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7112, + "grad_norm": 7.581526279449463, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7106, + "grad_norm": 1.8422462940216064, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7126, + "grad_norm": 10.206927299499512, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6958, + "grad_norm": 0.7233216166496277, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6933, + "grad_norm": 0.654333770275116, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436486656, + "loss": 0.6836, + "grad_norm": 4.067215919494629, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7089, + "grad_norm": 9.058445930480957, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7376, + "grad_norm": 11.79687213897705, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.777, + "grad_norm": 18.756315231323242, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7018, + "grad_norm": 6.430751323699951, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6932, + "grad_norm": 3.9170732498168945, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6844, + "grad_norm": 4.9054412841796875, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7826, + "grad_norm": 9.598040580749512, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7868, + "grad_norm": 8.2277193069458, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7461, + "grad_norm": 7.935102939605713, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7158, + "grad_norm": 4.619503021240234, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7346, + "grad_norm": 6.56942081451416, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436486656, + "loss": 0.7036, + "grad_norm": 2.9651873111724854, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.693, + "grad_norm": 0.8042317032814026, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7012, + "grad_norm": 3.191408395767212, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7023, + "grad_norm": 3.164586305618286, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7224, + "grad_norm": 5.403274059295654, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7165, + "grad_norm": 4.267147541046143, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7258, + "grad_norm": 25.909683227539062, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.752, + "grad_norm": 20.13861846923828, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7012, + "grad_norm": 6.156324863433838, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6931, + "grad_norm": 0.7320787310600281, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6816, + "grad_norm": 3.479029417037964, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6772, + "grad_norm": 0.9802557826042175, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6891, + "grad_norm": 2.2437896728515625, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6941, + "grad_norm": 0.6797558069229126, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7097, + "grad_norm": 4.842504501342773, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7239, + "grad_norm": 5.748685359954834, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6927, + "grad_norm": 1.6999866962432861, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7047, + "grad_norm": 3.253180503845215, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7452, + "grad_norm": 5.479006290435791, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7166, + "grad_norm": 3.281168222427368, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6957, + "grad_norm": 2.4656944274902344, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7069, + "grad_norm": 2.9342591762542725, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6999, + "grad_norm": 1.5350013971328735, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6956, + "grad_norm": 1.0407488346099854, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6895, + "grad_norm": 1.7011079788208008, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6834, + "grad_norm": 0.4810512661933899, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7463, + "grad_norm": 6.7247209548950195, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7118, + "grad_norm": 3.6937692165374756, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.689, + "grad_norm": 0.8106914162635803, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6914, + "grad_norm": 0.625818133354187, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6808, + "grad_norm": 0.6899487972259521, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.7571, + "grad_norm": 8.942800521850586, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6214, + "grad_norm": 8.304414749145508, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436505088, + "loss": 0.7602, + "grad_norm": 10.992262840270996, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6943, + "grad_norm": 3.8538589477539062, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7048, + "grad_norm": 2.993887186050415, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6944, + "grad_norm": 2.347763776779175, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.698, + "grad_norm": 2.9874112606048584, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6893, + "grad_norm": 4.87800931930542, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6761, + "grad_norm": 2.7489819526672363, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.684, + "grad_norm": 4.792850971221924, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 1.5391, + "grad_norm": 629.42529296875, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 1.4526, + "grad_norm": 953.7625732421875, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6903, + "grad_norm": 3.8154449462890625, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7706, + "grad_norm": 22.669921875, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6919, + "grad_norm": 5.547135829925537, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7286, + "grad_norm": 13.827351570129395, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6993, + "grad_norm": 2.970682144165039, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436486656, + "loss": 0.6971, + "grad_norm": 6.324443817138672, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7232, + "grad_norm": 10.709604263305664, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6972, + "grad_norm": 4.308836460113525, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7233, + "grad_norm": 4.744782447814941, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7241, + "grad_norm": 5.329172134399414, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6877, + "grad_norm": 2.647876024246216, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6873, + "grad_norm": 1.5647369623184204, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6928, + "grad_norm": 1.0161093473434448, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436509696, + "loss": 0.6923, + "grad_norm": 4.675751686096191, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7133, + "grad_norm": 5.3070244789123535, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6958, + "grad_norm": 2.993804693222046, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6822, + "grad_norm": 0.3614085018634796, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7044, + "grad_norm": 3.6765096187591553, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7386, + "grad_norm": 5.548724174499512, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6787, + "grad_norm": 1.3039053678512573, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6922, + "grad_norm": 0.9948331117630005, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6993, + "grad_norm": 2.046560764312744, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7098, + "grad_norm": 4.035542964935303, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7152, + "grad_norm": 3.6364972591400146, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6859, + "grad_norm": 2.299128293991089, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6917, + "grad_norm": 0.38097113370895386, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6877, + "grad_norm": 0.7054401636123657, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7264, + "grad_norm": 4.0929718017578125, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6985, + "grad_norm": 1.2673958539962769, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.9032, + "grad_norm": 384.8502197265625, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7065, + "grad_norm": 3.4167368412017822, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6838, + "grad_norm": 1.3940906524658203, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.714, + "grad_norm": 4.32194709777832, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6939, + "grad_norm": 0.57419353723526, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7649, + "grad_norm": 12.28019905090332, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6601, + "grad_norm": 3.002559185028076, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7439, + "grad_norm": 8.801597595214844, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7298, + "grad_norm": 6.72214937210083, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7027, + "grad_norm": 2.207871437072754, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7228, + "grad_norm": 4.154045104980469, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7363, + "grad_norm": 5.225917816162109, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7046, + "grad_norm": 4.986802101135254, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6977, + "grad_norm": 3.269658327102661, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7303, + "grad_norm": 6.697885990142822, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.693, + "grad_norm": 1.2281299829483032, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7103, + "grad_norm": 4.732335567474365, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6919, + "grad_norm": 0.6390857696533203, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6952, + "grad_norm": 3.3512179851531982, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6855, + "grad_norm": 3.189922332763672, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7215, + "grad_norm": 7.564175128936768, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7457, + "grad_norm": 11.786471366882324, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6974, + "grad_norm": 2.0770699977874756, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6954, + "grad_norm": 0.7252816557884216, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7259, + "grad_norm": 6.376246452331543, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6945, + "grad_norm": 0.6783043146133423, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6875, + "grad_norm": 1.0938622951507568, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7097, + "grad_norm": 2.026672840118408, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7341, + "grad_norm": 3.4444167613983154, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6996, + "grad_norm": 1.6396775245666504, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7309, + "grad_norm": 5.627193927764893, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.767, + "grad_norm": 7.814610958099365, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7239, + "grad_norm": 4.230601787567139, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6932, + "grad_norm": 0.8061604499816895, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7487, + "grad_norm": 13.217220306396484, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7213, + "grad_norm": 35.40034484863281, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.8857, + "grad_norm": 54.41415786743164, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436505088, + "loss": 8.6117, + "grad_norm": 955.7543334960938, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 2.7996, + "grad_norm": 117.43094635009766, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6981, + "grad_norm": 11.096137046813965, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 1.0426, + "grad_norm": 109.74464416503906, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6445, + "grad_norm": 1.0396405458450317, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.8035, + "grad_norm": 29.488008499145508, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6964, + "grad_norm": 11.784250259399414, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7271, + "grad_norm": 22.180479049682617, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6864, + "grad_norm": 7.711079120635986, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.7813, + "grad_norm": 27.22284507751465, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6929, + "grad_norm": 1.6457082033157349, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6906, + "grad_norm": 3.238846778869629, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7703, + "grad_norm": 20.000551223754883, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7116, + "grad_norm": 8.898648262023926, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6833, + "grad_norm": 2.6680290699005127, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.8925, + "grad_norm": 26.688859939575195, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7589, + "grad_norm": 12.566595077514648, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.8083, + "grad_norm": 10.73663330078125, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7571, + "grad_norm": 7.922856330871582, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7072, + "grad_norm": 5.189355850219727, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6976, + "grad_norm": 3.0700182914733887, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.7015, + "grad_norm": 8.973566055297852, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7038, + "grad_norm": 3.6078603267669678, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7016, + "grad_norm": 3.224282741546631, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7081, + "grad_norm": 7.415835380554199, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6852, + "grad_norm": 3.429851770401001, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7103, + "grad_norm": 12.240126609802246, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7464, + "grad_norm": 25.689781188964844, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 3.7279, + "grad_norm": 2490.349853515625, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436505088, + "loss": 0.7442, + "grad_norm": 35.38889694213867, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7168, + "grad_norm": 7.424646377563477, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6964, + "grad_norm": 2.228519916534424, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7174, + "grad_norm": 6.220647811889648, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6951, + "grad_norm": 2.9580321311950684, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7867, + "grad_norm": 20.92064094543457, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7078, + "grad_norm": 9.697556495666504, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7026, + "grad_norm": 8.819724082946777, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 1.0088, + "grad_norm": 56.88178634643555, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.983, + "grad_norm": 39.75460433959961, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6909, + "grad_norm": 1.4935669898986816, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6904, + "grad_norm": 3.2304587364196777, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7243, + "grad_norm": 6.715885639190674, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7164, + "grad_norm": 5.32466459274292, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6945, + "grad_norm": 3.247708797454834, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6944, + "grad_norm": 1.478901982307434, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6857, + "grad_norm": 2.018373966217041, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.7176, + "grad_norm": 4.572305679321289, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6899, + "grad_norm": 2.4458887577056885, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7109, + "grad_norm": 3.7370293140411377, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6858, + "grad_norm": 6.156641960144043, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7158, + "grad_norm": 8.562531471252441, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7581, + "grad_norm": 15.610877990722656, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.8616, + "grad_norm": 28.86176872253418, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.7289, + "grad_norm": 12.65765380859375, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6942, + "grad_norm": 3.0870208740234375, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6847, + "grad_norm": 1.7191615104675293, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6959, + "grad_norm": 4.709394931793213, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6923, + "grad_norm": 0.3384479582309723, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43648512, + "loss": 0.6974, + "grad_norm": 3.040966749191284, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6934, + "grad_norm": 6.0961785316467285, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436486656, + "loss": 0.6687, + "grad_norm": 3.7274763584136963, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7324, + "grad_norm": 8.206989288330078, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7884, + "grad_norm": 13.597814559936523, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7262, + "grad_norm": 7.7521209716796875, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7064, + "grad_norm": 4.958117485046387, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.717, + "grad_norm": 9.901411056518555, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.8864, + "grad_norm": 110.6561508178711, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7264, + "grad_norm": 12.446073532104492, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.717, + "grad_norm": 8.49349308013916, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436505088, + "loss": 0.6926, + "grad_norm": 1.2874635457992554, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6936, + "grad_norm": 2.8526198863983154, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6873, + "grad_norm": 2.057274103164673, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6991, + "grad_norm": 4.646012783050537, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7495, + "grad_norm": 13.398412704467773, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6945, + "grad_norm": 3.868546724319458, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7191, + "grad_norm": 9.42562198638916, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6993, + "grad_norm": 5.16719388961792, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.7045, + "grad_norm": 4.41701602935791, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6908, + "grad_norm": 0.18271850049495697, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6977, + "grad_norm": 3.310356378555298, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6938, + "grad_norm": 0.8721852898597717, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6897, + "grad_norm": 1.138742446899414, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7198, + "grad_norm": 7.447521686553955, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7193, + "grad_norm": 6.702768802642822, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.703, + "grad_norm": 4.019357204437256, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6896, + "grad_norm": 2.661290168762207, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6723, + "grad_norm": 0.8227198123931885, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6356, + "grad_norm": 0.611751139163971, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7778, + "grad_norm": 6.850287914276123, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.8188, + "grad_norm": 8.648664474487305, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436505088, + "loss": 0.859, + "grad_norm": 10.878265380859375, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7621, + "grad_norm": 7.449574947357178, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.7155, + "grad_norm": 5.004981994628906, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6975, + "grad_norm": 3.7215969562530518, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7004, + "grad_norm": 1.6247286796569824, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6936, + "grad_norm": 3.116683006286621, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7007, + "grad_norm": 2.8766980171203613, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7026, + "grad_norm": 3.182438611984253, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7061, + "grad_norm": 7.595936298370361, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7007, + "grad_norm": 6.171183109283447, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.694, + "grad_norm": 0.5194656252861023, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6938, + "grad_norm": 0.9934120774269104, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6885, + "grad_norm": 4.789129257202148, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7003, + "grad_norm": 4.978322505950928, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6986, + "grad_norm": 5.017582893371582, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6932, + "grad_norm": 0.6535791158676147, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6967, + "grad_norm": 1.8443365097045898, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7095, + "grad_norm": 5.252786636352539, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7028, + "grad_norm": 4.747971534729004, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6937, + "grad_norm": 0.3325246572494507, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6854, + "grad_norm": 2.7582578659057617, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6851, + "grad_norm": 0.7204828262329102, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7124, + "grad_norm": 6.44591760635376, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7344, + "grad_norm": 9.849533081054688, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.727, + "grad_norm": 8.471578598022461, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436483584, + "loss": 0.6919, + "grad_norm": 1.6116753816604614, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6936, + "grad_norm": 0.303459495306015, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6904, + "grad_norm": 0.9633539915084839, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7085, + "grad_norm": 2.903498411178589, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6978, + "grad_norm": 1.6771186590194702, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6287, + "grad_norm": 4.6518874168396, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7269, + "grad_norm": 3.381722927093506, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7152, + "grad_norm": 2.5844802856445312, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7049, + "grad_norm": 2.0991740226745605, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6916, + "grad_norm": 0.6036875247955322, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7, + "grad_norm": 1.5937211513519287, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6927, + "grad_norm": 1.2352635860443115, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7155, + "grad_norm": 3.2748970985412598, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6699, + "grad_norm": 1.1388378143310547, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6949, + "grad_norm": 1.4779818058013916, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7139, + "grad_norm": 3.3187434673309326, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7095, + "grad_norm": 3.445469617843628, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6929, + "grad_norm": 0.42188122868537903, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7203, + "grad_norm": 4.540777206420898, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7249, + "grad_norm": 4.73647928237915, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7231, + "grad_norm": 4.302307605743408, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6966, + "grad_norm": 1.0013757944107056, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6944, + "grad_norm": 0.4005091190338135, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6964, + "grad_norm": 1.195398211479187, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6949, + "grad_norm": 0.445303350687027, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436506624, + "loss": 0.6906, + "grad_norm": 1.8895539045333862, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6978, + "grad_norm": 1.450884222984314, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6958, + "grad_norm": 1.5937141180038452, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436506624, + "loss": 0.6904, + "grad_norm": 0.8058385252952576, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6866, + "grad_norm": 0.38076117634773254, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7251, + "grad_norm": 4.40585470199585, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6778, + "grad_norm": 0.3937416672706604, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7257, + "grad_norm": 4.481349468231201, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6824, + "grad_norm": 0.7417492270469666, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6859, + "grad_norm": 0.5538635849952698, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7065, + "grad_norm": 2.597095012664795, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7046, + "grad_norm": 2.733966112136841, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6933, + "grad_norm": 0.28869497776031494, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7097, + "grad_norm": 2.621511936187744, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.694, + "grad_norm": 0.5179890394210815, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.698, + "grad_norm": 1.3671116828918457, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6924, + "grad_norm": 1.7928180694580078, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6932, + "grad_norm": 0.484050452709198, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6938, + "grad_norm": 0.32956069707870483, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6923, + "grad_norm": 1.2161016464233398, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6962, + "grad_norm": 1.0045655965805054, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6839, + "grad_norm": 3.7410693168640137, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6911, + "grad_norm": 0.3017803132534027, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6921, + "grad_norm": 0.3129913806915283, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6897, + "grad_norm": 0.5246943831443787, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7041, + "grad_norm": 2.4613144397735596, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6989, + "grad_norm": 1.9753458499908447, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6945, + "grad_norm": 3.1460142135620117, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6935, + "grad_norm": 0.6829839944839478, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6948, + "grad_norm": 1.0542210340499878, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6962, + "grad_norm": 1.5967320203781128, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6923, + "grad_norm": 1.2215348482131958, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6905, + "grad_norm": 0.8110335469245911, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6932, + "grad_norm": 0.3255617618560791, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6918, + "grad_norm": 0.7455755472183228, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6909, + "grad_norm": 0.4633011221885681, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6858, + "grad_norm": 1.4376530647277832, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7093, + "grad_norm": 2.4805209636688232, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6888, + "grad_norm": 0.29221001267433167, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.689, + "grad_norm": 0.19223886728286743, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6965, + "grad_norm": 0.8182043433189392, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6971, + "grad_norm": 0.9049632549285889, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6958, + "grad_norm": 1.5953260660171509, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6883, + "grad_norm": 0.9256069660186768, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.7147, + "grad_norm": 3.0222573280334473, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6753, + "grad_norm": 0.2415858507156372, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7192, + "grad_norm": 3.4998676776885986, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7426, + "grad_norm": 5.271790981292725, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7482, + "grad_norm": 5.714134693145752, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6663, + "grad_norm": 1.3285033702850342, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7104, + "grad_norm": 2.6549675464630127, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7055, + "grad_norm": 2.144930601119995, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6855, + "grad_norm": 1.2357250452041626, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6944, + "grad_norm": 0.36976227164268494, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.7028, + "grad_norm": 2.920207977294922, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6931, + "grad_norm": 0.3673052489757538, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.698, + "grad_norm": 1.4576783180236816, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6998, + "grad_norm": 2.8884172439575195, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6932, + "grad_norm": 0.2826431095600128, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6945, + "grad_norm": 0.6497731804847717, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6946, + "grad_norm": 0.6876727938652039, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6931, + "grad_norm": 1.1170787811279297, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6928, + "grad_norm": 0.4449024200439453, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6946, + "grad_norm": 0.6323174834251404, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6943, + "grad_norm": 0.2882833778858185, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6937, + "grad_norm": 0.19587744772434235, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6916, + "grad_norm": 3.122863531112671, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6929, + "grad_norm": 0.21753369271755219, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6907, + "grad_norm": 0.5891724228858948, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6965, + "grad_norm": 0.8459489345550537, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6875, + "grad_norm": 0.6120290756225586, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6889, + "grad_norm": 0.2132270187139511, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6833, + "grad_norm": 0.5512095093727112, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6854, + "grad_norm": 0.17841240763664246, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6857, + "grad_norm": 0.2665537893772125, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6864, + "grad_norm": 0.4519351124763489, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7188, + "grad_norm": 2.3208959102630615, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7008, + "grad_norm": 1.2153486013412476, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.666, + "grad_norm": 1.4940522909164429, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6756, + "grad_norm": 1.0031754970550537, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6939, + "grad_norm": 0.5635747313499451, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7113, + "grad_norm": 1.9390994310379028, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.7035, + "grad_norm": 1.4331762790679932, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6983, + "grad_norm": 0.8458346724510193, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.697, + "grad_norm": 1.5229713916778564, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6935, + "grad_norm": 0.3006005883216858, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6902, + "grad_norm": 0.22879034280776978, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.7338, + "grad_norm": 3.7611985206604004, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6734, + "grad_norm": 0.8908981680870056, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7001, + "grad_norm": 1.1307358741760254, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7162, + "grad_norm": 2.171896457672119, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6758, + "grad_norm": 0.3393321931362152, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7051, + "grad_norm": 1.4899122714996338, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6987, + "grad_norm": 1.1016749143600464, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6994, + "grad_norm": 1.2285826206207275, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.7078, + "grad_norm": 2.8471813201904297, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6958, + "grad_norm": 1.17753267288208, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7016, + "grad_norm": 3.85834002494812, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6896, + "grad_norm": 0.6093201637268066, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7001, + "grad_norm": 2.0244333744049072, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.699, + "grad_norm": 2.0040009021759033, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7041, + "grad_norm": 2.5479047298431396, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7048, + "grad_norm": 2.415851593017578, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7129, + "grad_norm": 3.355107307434082, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7067, + "grad_norm": 2.626291275024414, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.7026, + "grad_norm": 2.0984411239624023, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6958, + "grad_norm": 1.4404969215393066, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6934, + "grad_norm": 0.9449305534362793, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6988, + "grad_norm": 1.3874497413635254, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.705, + "grad_norm": 1.7997007369995117, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6938, + "grad_norm": 0.4548819661140442, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.706, + "grad_norm": 1.799126148223877, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6872, + "grad_norm": 0.6468462347984314, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6987, + "grad_norm": 1.1589343547821045, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.7018, + "grad_norm": 2.3516039848327637, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6901, + "grad_norm": 1.828291893005371, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.694, + "grad_norm": 1.2622337341308594, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6951, + "grad_norm": 1.9759842157363892, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.695, + "grad_norm": 1.0125367641448975, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.698, + "grad_norm": 2.8907904624938965, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6923, + "grad_norm": 0.6129000782966614, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6951, + "grad_norm": 2.9931020736694336, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6938, + "grad_norm": 1.1200330257415771, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6971, + "grad_norm": 1.9377878904342651, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7001, + "grad_norm": 2.697136163711548, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6939, + "grad_norm": 0.6682946085929871, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.695, + "grad_norm": 0.9443612098693848, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6936, + "grad_norm": 0.30825236439704895, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6934, + "grad_norm": 1.4312561750411987, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6938, + "grad_norm": 1.903323769569397, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6932, + "grad_norm": 0.3123742640018463, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6948, + "grad_norm": 1.971826434135437, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6932, + "grad_norm": 0.1599097102880478, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6927, + "grad_norm": 0.2972242534160614, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6916, + "grad_norm": 1.6250648498535156, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.695, + "grad_norm": 1.6420953273773193, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6953, + "grad_norm": 0.5234501361846924, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6936, + "grad_norm": 0.18644632399082184, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6935, + "grad_norm": 0.18596041202545166, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6937, + "grad_norm": 0.8594840168952942, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6932, + "grad_norm": 1.7944879531860352, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6941, + "grad_norm": 1.4224085807800293, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6926, + "grad_norm": 1.4243217706680298, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6986, + "grad_norm": 3.5158450603485107, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7014, + "grad_norm": 4.417989253997803, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6936, + "grad_norm": 0.2862250804901123, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6968, + "grad_norm": 2.088808298110962, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6932, + "grad_norm": 0.12058515101671219, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6937, + "grad_norm": 1.4527366161346436, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6935, + "grad_norm": 0.1328340470790863, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6932, + "grad_norm": 0.26891735196113586, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6986, + "grad_norm": 2.0107481479644775, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6929, + "grad_norm": 0.3955501914024353, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6912, + "grad_norm": 0.6446731686592102, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6908, + "grad_norm": 0.6325265765190125, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.7002, + "grad_norm": 2.023848533630371, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6862, + "grad_norm": 1.4843679666519165, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650816, + "loss": 0.7028, + "grad_norm": 2.356333017349243, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7029, + "grad_norm": 2.3548803329467773, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6966, + "grad_norm": 1.055834412574768, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6951, + "grad_norm": 0.6339371204376221, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6952, + "grad_norm": 0.5132887959480286, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.694, + "grad_norm": 0.7200871109962463, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6932, + "grad_norm": 1.1157199144363403, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6917, + "grad_norm": 0.9060938358306885, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6931, + "grad_norm": 0.12987352907657623, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7026, + "grad_norm": 2.9884724617004395, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6936, + "grad_norm": 0.48464328050613403, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6905, + "grad_norm": 0.5218011140823364, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6897, + "grad_norm": 0.5368003249168396, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6941, + "grad_norm": 0.7229459285736084, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6974, + "grad_norm": 1.171156644821167, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6949, + "grad_norm": 0.6926687359809875, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6881, + "grad_norm": 0.7931606769561768, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6908, + "grad_norm": 0.13625140488147736, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6918, + "grad_norm": 0.13791726529598236, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436505088, + "loss": 0.6889, + "grad_norm": 0.5462064146995544, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6994, + "grad_norm": 1.8001433610916138, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6917, + "grad_norm": 0.11930370330810547, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.698, + "grad_norm": 1.3849172592163086, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6846, + "grad_norm": 1.7470752000808716, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6763, + "grad_norm": 3.816652536392212, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6915, + "grad_norm": 0.12161583453416824, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7021, + "grad_norm": 2.1705498695373535, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7001, + "grad_norm": 1.703685998916626, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6889, + "grad_norm": 0.8694254159927368, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6977, + "grad_norm": 1.3277173042297363, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6915, + "grad_norm": 0.20042864978313446, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.7038, + "grad_norm": 2.742241144180298, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6953, + "grad_norm": 0.8764731884002686, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6915, + "grad_norm": 0.6461377143859863, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6927, + "grad_norm": 0.1436101794242859, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6995, + "grad_norm": 2.779043674468994, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.697, + "grad_norm": 1.653348684310913, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.692, + "grad_norm": 1.2176493406295776, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6955, + "grad_norm": 1.8435603380203247, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6935, + "grad_norm": 0.5885941386222839, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6932, + "grad_norm": 0.3721270263195038, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6933, + "grad_norm": 2.066635847091675, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.694, + "grad_norm": 0.6423971056938171, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6942, + "grad_norm": 1.3742074966430664, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6933, + "grad_norm": 0.4540001153945923, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6939, + "grad_norm": 0.9763819575309753, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6929, + "grad_norm": 0.47869110107421875, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6935, + "grad_norm": 0.7287462949752808, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6936, + "grad_norm": 0.360977441072464, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6941, + "grad_norm": 2.3478143215179443, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6933, + "grad_norm": 1.0061547756195068, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6932, + "grad_norm": 0.36713317036628723, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.694, + "grad_norm": 0.11745814234018326, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6933, + "grad_norm": 0.11780799180269241, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.692, + "grad_norm": 0.901201605796814, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6965, + "grad_norm": 2.238145351409912, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436503552, + "loss": 0.6914, + "grad_norm": 0.5264253616333008, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6904, + "grad_norm": 1.795876383781433, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6955, + "grad_norm": 1.3277792930603027, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6974, + "grad_norm": 2.0300650596618652, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.699, + "grad_norm": 2.9670557975769043, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6969, + "grad_norm": 1.6886050701141357, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6953, + "grad_norm": 1.2856357097625732, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6981, + "grad_norm": 2.9998574256896973, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436502016, + "loss": 0.6933, + "grad_norm": 0.15387730300426483, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.691, + "grad_norm": 1.5467422008514404, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6923, + "grad_norm": 0.5135179758071899, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6927, + "grad_norm": 1.547203540802002, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6941, + "grad_norm": 0.8537980914115906, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.692, + "grad_norm": 0.8824675679206848, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6938, + "grad_norm": 0.5226725339889526, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6947, + "grad_norm": 0.8332694172859192, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.43650048, + "loss": 0.6951, + "grad_norm": 1.4768182039260864, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6929, + "grad_norm": 0.546176552772522, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436489728, + "loss": 0.6949, + "grad_norm": 2.412780523300171, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6941, + "grad_norm": 1.1114288568496704, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6927, + "grad_norm": 0.9184398651123047, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6934, + "grad_norm": 1.2785730361938477, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6947, + "grad_norm": 2.0642752647399902, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6933, + "grad_norm": 0.3924402594566345, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6925, + "grad_norm": 1.2844653129577637, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6935, + "grad_norm": 0.9053114056587219, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6929, + "grad_norm": 0.3461986780166626, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6922, + "grad_norm": 1.2314815521240234, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6924, + "grad_norm": 1.2845882177352905, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6933, + "grad_norm": 0.10496394336223602, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6935, + "grad_norm": 1.054183840751648, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6936, + "grad_norm": 0.09316414594650269, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.693, + "grad_norm": 0.35075682401657104, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6917, + "grad_norm": 1.6169066429138184, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6954, + "grad_norm": 2.1305136680603027, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6944, + "grad_norm": 0.7953799962997437, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.6926, + "grad_norm": 1.6301136016845703, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6938, + "grad_norm": 0.8630837798118591, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6929, + "grad_norm": 0.5528228878974915, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6934, + "grad_norm": 0.1442209929227829, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6932, + "grad_norm": 0.7466127276420593, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6941, + "grad_norm": 0.7418840527534485, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.4364928, + "loss": 0.692, + "grad_norm": 1.288956642150879, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436491264, + "loss": 0.6948, + "grad_norm": 1.4385509490966797, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436505088, + "loss": 0.6939, + "grad_norm": 1.1222437620162964, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.6918, + "grad_norm": 0.5726282000541687, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6931, + "grad_norm": 0.12152113765478134, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436495872, + "loss": 0.694, + "grad_norm": 1.4223427772521973, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6923, + "grad_norm": 0.3644523024559021, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436488192, + "loss": 0.6934, + "grad_norm": 0.2463395595550537, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436498944, + "loss": 0.6954, + "grad_norm": 2.4520082473754883, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6925, + "grad_norm": 1.2283471822738647, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436497408, + "loss": 0.6927, + "grad_norm": 1.2185505628585815, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "loss": 0.6916, + "grad_norm": 1.0751972198486328, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.497739264, + "gpu_mem": 4.436494336, + "train_runtime": 1384.8444, + "train_samples_per_second": 29.56, + "train_steps_per_second": 0.462, + "total_flos": 0.0, + "train_loss": 0.7478797028772532 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0052eed638e4aeb48f103586efb96096bb8d3ed --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc2f59f3dad59a371f46dd1db575a7703e7145a --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5122336227308603 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..e0e259004aeaffa5435c7598d4d3c8446b98380f --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25389056 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-winogrande-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2", + "seed": 42, + "timestamp": "2025-08-31T06:05:39.750636" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..760bf79bc93cd10a09fe4b3d2f7005906a9f5729 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r32-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.496805376, + "gpu_mem": 4.518891008, + "loss": 3.3802, + "grad_norm": 253.36181640625, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.502507008, + "gpu_mem": 4.722001408, + "loss": 3.3361, + "grad_norm": 248.338623046875, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.503096832, + "gpu_mem": 4.722006016, + "loss": 2.081, + "grad_norm": 174.30479431152344, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.503490048, + "gpu_mem": 4.72200448, + "loss": 1.1136, + "grad_norm": 60.59732437133789, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.503883264, + "gpu_mem": 4.72200448, + "loss": 0.8683, + "grad_norm": 95.81663513183594, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.504079872, + "gpu_mem": 4.722010624, + "loss": 0.7639, + "grad_norm": 33.08975601196289, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.50427648, + "gpu_mem": 4.722016768, + "loss": 0.7914, + "grad_norm": 50.723690032958984, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.504473088, + "gpu_mem": 4.721999872, + "loss": 0.7171, + "grad_norm": 10.0919828414917, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.504669696, + "gpu_mem": 4.722006016, + "loss": 0.7789, + "grad_norm": 28.384031295776367, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.504866304, + "gpu_mem": 4.722009088, + "loss": 0.7199, + "grad_norm": 12.00358772277832, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.505062912, + "gpu_mem": 4.721998336, + "loss": 0.6921, + "grad_norm": 3.459439754486084, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.50525952, + "gpu_mem": 4.722002944, + "loss": 0.6953, + "grad_norm": 12.826359748840332, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.505456128, + "gpu_mem": 4.722010624, + "loss": 0.8001, + "grad_norm": 21.10012435913086, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.505652736, + "gpu_mem": 4.722006016, + "loss": 0.6948, + "grad_norm": 8.596832275390625, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.505849344, + "gpu_mem": 4.722006016, + "loss": 0.6937, + "grad_norm": 2.300356388092041, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.505849344, + "gpu_mem": 4.722002944, + "loss": 0.7617, + "grad_norm": 14.800594329833984, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.506045952, + "gpu_mem": 4.722002944, + "loss": 0.695, + "grad_norm": 2.175337076187134, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.506045952, + "gpu_mem": 4.722006016, + "loss": 0.7047, + "grad_norm": 2.1774232387542725, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.50624256, + "gpu_mem": 4.722002944, + "loss": 0.6966, + "grad_norm": 8.193639755249023, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.50624256, + "gpu_mem": 4.722010624, + "loss": 0.7183, + "grad_norm": 11.231057167053223, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.50624256, + "gpu_mem": 4.722002944, + "loss": 0.7006, + "grad_norm": 6.143838882446289, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.506439168, + "gpu_mem": 4.722002944, + "loss": 0.8556, + "grad_norm": 21.67837905883789, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.506439168, + "gpu_mem": 4.721998336, + "loss": 0.8513, + "grad_norm": 21.96662139892578, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.506439168, + "gpu_mem": 4.722001408, + "loss": 0.6999, + "grad_norm": 2.1809275150299072, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.506635776, + "gpu_mem": 4.72200448, + "loss": 0.7038, + "grad_norm": 3.4095001220703125, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.506635776, + "gpu_mem": 4.721999872, + "loss": 0.6951, + "grad_norm": 1.9444891214370728, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.506635776, + "gpu_mem": 4.721998336, + "loss": 0.707, + "grad_norm": 11.482319831848145, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.506635776, + "gpu_mem": 4.72200448, + "loss": 0.7511, + "grad_norm": 15.075895309448242, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.506635776, + "gpu_mem": 4.722002944, + "loss": 0.6974, + "grad_norm": 1.9298940896987915, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.506635776, + "gpu_mem": 4.722002944, + "loss": 0.9994, + "grad_norm": 29.269351959228516, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.506635776, + "gpu_mem": 4.722002944, + "loss": 0.9209, + "grad_norm": 23.943628311157227, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.506635776, + "gpu_mem": 4.721999872, + "loss": 0.6951, + "grad_norm": 8.235535621643066, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.506635776, + "gpu_mem": 4.721999872, + "loss": 0.9787, + "grad_norm": 23.34931755065918, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.506832384, + "gpu_mem": 4.721999872, + "loss": 1.0282, + "grad_norm": 23.90515899658203, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.506832384, + "gpu_mem": 4.722006016, + "loss": 0.8078, + "grad_norm": 14.190905570983887, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.506832384, + "gpu_mem": 4.722001408, + "loss": 0.7867, + "grad_norm": 15.379456520080566, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.506832384, + "gpu_mem": 4.721999872, + "loss": 0.6984, + "grad_norm": 2.5342094898223877, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.705, + "grad_norm": 3.916247844696045, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.7085, + "grad_norm": 5.406979084014893, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7168, + "grad_norm": 5.44709587097168, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7109, + "grad_norm": 4.754012584686279, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7007, + "grad_norm": 5.803560256958008, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7216, + "grad_norm": 6.457539081573486, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6907, + "grad_norm": 1.7381521463394165, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6896, + "grad_norm": 1.9347078800201416, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7025, + "grad_norm": 0.6743069291114807, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7046, + "grad_norm": 3.2521748542785645, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6868, + "grad_norm": 2.556065082550049, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.7846, + "grad_norm": 11.881805419921875, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7007, + "grad_norm": 2.2975411415100098, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6933, + "grad_norm": 3.276329755783081, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7561, + "grad_norm": 8.107083320617676, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7031, + "grad_norm": 4.401709079742432, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721993728, + "loss": 0.818, + "grad_norm": 10.398260116577148, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7558, + "grad_norm": 6.696636199951172, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.697, + "grad_norm": 1.6235384941101074, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.7139, + "grad_norm": 3.6876883506774902, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7047, + "grad_norm": 2.785215139389038, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6877, + "grad_norm": 0.7108306288719177, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6977, + "grad_norm": 1.1876534223556519, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.8211, + "grad_norm": 7.314707279205322, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.802, + "grad_norm": 6.773654937744141, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.694, + "grad_norm": 2.547663450241089, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.8999, + "grad_norm": 9.247257232666016, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7776, + "grad_norm": 6.204533100128174, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7534, + "grad_norm": 5.747366428375244, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 1.1761, + "grad_norm": 24.665956497192383, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.8624, + "grad_norm": 10.04936408996582, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7373, + "grad_norm": 5.797850608825684, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.8312, + "grad_norm": 8.499218940734863, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7085, + "grad_norm": 2.533390998840332, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7452, + "grad_norm": 5.37766170501709, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.8258, + "grad_norm": 9.154284477233887, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6962, + "grad_norm": 0.5715246796607971, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6878, + "grad_norm": 2.7505483627319336, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.8035, + "grad_norm": 7.018910884857178, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.7074, + "grad_norm": 2.2377219200134277, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6947, + "grad_norm": 0.630942165851593, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7257, + "grad_norm": 3.1362733840942383, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7002, + "grad_norm": 1.4922473430633545, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.7229, + "grad_norm": 2.751971483230591, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7126, + "grad_norm": 2.0337600708007812, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6947, + "grad_norm": 1.6015232801437378, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7813, + "grad_norm": 4.816605567932129, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7868, + "grad_norm": 5.120372772216797, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7048, + "grad_norm": 0.6017482280731201, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7111, + "grad_norm": 3.281963348388672, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7136, + "grad_norm": 2.9227921962738037, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6983, + "grad_norm": 2.0136280059814453, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7052, + "grad_norm": 2.654031991958618, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6687, + "grad_norm": 1.887153148651123, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 1.0065, + "grad_norm": 13.65556812286377, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 1.0244, + "grad_norm": 13.303121566772461, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.8488, + "grad_norm": 8.569211959838867, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6825, + "grad_norm": 0.8460494875907898, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6925, + "grad_norm": 1.385131597518921, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7639, + "grad_norm": 4.247017860412598, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7019, + "grad_norm": 1.6543782949447632, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.8164, + "grad_norm": 8.424407958984375, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.806, + "grad_norm": 7.710474967956543, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7884, + "grad_norm": 7.09982442855835, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6929, + "grad_norm": 0.11996857821941376, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7009, + "grad_norm": 2.3729610443115234, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7361, + "grad_norm": 4.637565612792969, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.696, + "grad_norm": 0.11807335168123245, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7192, + "grad_norm": 2.5029454231262207, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7075, + "grad_norm": 1.7361555099487305, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6985, + "grad_norm": 1.1873990297317505, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6947, + "grad_norm": 0.2976088523864746, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6903, + "grad_norm": 1.344376802444458, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7041, + "grad_norm": 1.4834744930267334, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6957, + "grad_norm": 0.7356757521629333, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721995264, + "loss": 0.7018, + "grad_norm": 2.2577269077301025, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6947, + "grad_norm": 1.0147289037704468, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7013, + "grad_norm": 1.2209444046020508, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7553, + "grad_norm": 4.117190837860107, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6923, + "grad_norm": 0.3909473121166229, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6933, + "grad_norm": 0.17205289006233215, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7166, + "grad_norm": 2.7956204414367676, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6949, + "grad_norm": 0.13231131434440613, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.704, + "grad_norm": 1.2877357006072998, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6995, + "grad_norm": 2.115797519683838, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7006, + "grad_norm": 1.1016205549240112, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7171, + "grad_norm": 1.9102469682693481, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721995264, + "loss": 0.6984, + "grad_norm": 0.8227682113647461, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6914, + "grad_norm": 0.1047651618719101, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.709, + "grad_norm": 1.6401056051254272, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7362, + "grad_norm": 2.816617488861084, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6966, + "grad_norm": 0.6609585881233215, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6963, + "grad_norm": 0.36387738585472107, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7148, + "grad_norm": 1.8245207071304321, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6893, + "grad_norm": 0.4718245267868042, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6777, + "grad_norm": 1.7044023275375366, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7155, + "grad_norm": 1.5899581909179688, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6705, + "grad_norm": 0.2669835686683655, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6762, + "grad_norm": 0.1435621976852417, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6888, + "grad_norm": 0.5717059969902039, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6961, + "grad_norm": 0.5882948040962219, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6961, + "grad_norm": 0.5054025650024414, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7303, + "grad_norm": 2.6438841819763184, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7027, + "grad_norm": 1.6810760498046875, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6882, + "grad_norm": 0.5073636174201965, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7306, + "grad_norm": 1.8619046211242676, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7223, + "grad_norm": 1.7392523288726807, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6877, + "grad_norm": 0.9857088923454285, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.697, + "grad_norm": 1.5808554887771606, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6989, + "grad_norm": 1.1464070081710815, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6949, + "grad_norm": 0.577153205871582, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6932, + "grad_norm": 1.2154101133346558, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6817, + "grad_norm": 0.24656634032726288, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7387, + "grad_norm": 2.3622326850891113, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7146, + "grad_norm": 1.523602843284607, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6897, + "grad_norm": 0.2152739316225052, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6944, + "grad_norm": 0.23420673608779907, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6824, + "grad_norm": 0.11238644272089005, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.7565, + "grad_norm": 3.3180534839630127, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6069, + "grad_norm": 2.0010712146759033, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722013696, + "loss": 0.7669, + "grad_norm": 3.6668853759765625, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7043, + "grad_norm": 1.7419414520263672, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6989, + "grad_norm": 0.8371703028678894, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7174, + "grad_norm": 1.8541243076324463, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7208, + "grad_norm": 1.914364218711853, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6821, + "grad_norm": 0.8147350549697876, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6848, + "grad_norm": 1.0596096515655518, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6771, + "grad_norm": 0.23953458666801453, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7342, + "grad_norm": 2.343430995941162, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.715, + "grad_norm": 1.7582224607467651, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6979, + "grad_norm": 1.045183777809143, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7049, + "grad_norm": 1.4784225225448608, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6881, + "grad_norm": 0.5554291009902954, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7076, + "grad_norm": 2.163924217224121, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6985, + "grad_norm": 0.889345645904541, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721995264, + "loss": 0.6976, + "grad_norm": 0.7876308560371399, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6949, + "grad_norm": 0.37984946370124817, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7098, + "grad_norm": 1.5838266611099243, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.693, + "grad_norm": 0.050825413316488266, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6957, + "grad_norm": 0.5806949734687805, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.693, + "grad_norm": 0.31667885184288025, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6926, + "grad_norm": 0.18580806255340576, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6934, + "grad_norm": 0.2203487604856491, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722018304, + "loss": 0.6781, + "grad_norm": 1.0616934299468994, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7454, + "grad_norm": 2.1857786178588867, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7362, + "grad_norm": 1.9325038194656372, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.706, + "grad_norm": 1.332340121269226, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6899, + "grad_norm": 0.26042014360427856, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7508, + "grad_norm": 2.723623275756836, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6822, + "grad_norm": 0.8886094093322754, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7062, + "grad_norm": 1.2770217657089233, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7034, + "grad_norm": 1.127233624458313, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7275, + "grad_norm": 2.677400588989258, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.74, + "grad_norm": 2.948096752166748, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6872, + "grad_norm": 1.53387451171875, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6956, + "grad_norm": 0.16909082233905792, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6924, + "grad_norm": 0.2191479355096817, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7294, + "grad_norm": 3.705435276031494, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6943, + "grad_norm": 0.24546389281749725, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.7164, + "grad_norm": 2.8429245948791504, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7018, + "grad_norm": 1.9661619663238525, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.694, + "grad_norm": 1.9150495529174805, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6928, + "grad_norm": 0.606046736240387, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6964, + "grad_norm": 0.8584450483322144, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.709, + "grad_norm": 3.1065943241119385, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7409, + "grad_norm": 5.0242600440979, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7121, + "grad_norm": 2.291961431503296, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6959, + "grad_norm": 0.8388645052909851, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7079, + "grad_norm": 1.5561375617980957, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6979, + "grad_norm": 0.8813296556472778, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6816, + "grad_norm": 0.7830796837806702, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7126, + "grad_norm": 1.6642495393753052, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6944, + "grad_norm": 0.7529539465904236, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7476, + "grad_norm": 3.4023630619049072, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7228, + "grad_norm": 2.1201043128967285, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6934, + "grad_norm": 0.5101386904716492, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6946, + "grad_norm": 0.49885666370391846, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7186, + "grad_norm": 1.7088440656661987, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7311, + "grad_norm": 2.112462043762207, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6926, + "grad_norm": 0.06346680223941803, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6964, + "grad_norm": 1.3871630430221558, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6942, + "grad_norm": 0.46353602409362793, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6943, + "grad_norm": 0.08255524933338165, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6906, + "grad_norm": 1.5914099216461182, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7012, + "grad_norm": 1.0943394899368286, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6871, + "grad_norm": 0.3458816707134247, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6965, + "grad_norm": 0.7421514987945557, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7023, + "grad_norm": 1.5280835628509521, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.692, + "grad_norm": 0.105143703520298, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7078, + "grad_norm": 1.6071431636810303, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7133, + "grad_norm": 1.8062376976013184, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6997, + "grad_norm": 0.8809529542922974, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6917, + "grad_norm": 0.1358325183391571, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7242, + "grad_norm": 1.6336575746536255, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7078, + "grad_norm": 0.9952573776245117, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6935, + "grad_norm": 0.050323233008384705, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722013696, + "loss": 0.6988, + "grad_norm": 0.7715749740600586, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.724, + "grad_norm": 2.0942633152008057, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6775, + "grad_norm": 0.9262389540672302, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.7182, + "grad_norm": 2.0630316734313965, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.7078, + "grad_norm": 2.5474140644073486, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7062, + "grad_norm": 1.2299562692642212, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7014, + "grad_norm": 0.7796033024787903, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6778, + "grad_norm": 0.7502149939537048, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6825, + "grad_norm": 0.2572711110115051, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.7318, + "grad_norm": 1.9181207418441772, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7059, + "grad_norm": 0.9235746264457703, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6941, + "grad_norm": 0.5369551181793213, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7084, + "grad_norm": 0.9984049797058105, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6958, + "grad_norm": 0.46566784381866455, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7188, + "grad_norm": 1.4297221899032593, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6906, + "grad_norm": 0.5402958989143372, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6918, + "grad_norm": 0.868184506893158, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7173, + "grad_norm": 1.6750541925430298, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7295, + "grad_norm": 2.2165791988372803, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6975, + "grad_norm": 0.598291277885437, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6952, + "grad_norm": 0.26201707124710083, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6894, + "grad_norm": 0.1049848347902298, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7146, + "grad_norm": 1.0774105787277222, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6964, + "grad_norm": 0.38671672344207764, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6991, + "grad_norm": 0.8245941400527954, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7142, + "grad_norm": 1.5421704053878784, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7127, + "grad_norm": 1.2671699523925781, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6778, + "grad_norm": 0.9946145415306091, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6986, + "grad_norm": 0.5910211205482483, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722013696, + "loss": 0.697, + "grad_norm": 0.5007100701332092, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6933, + "grad_norm": 0.03463464230298996, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6928, + "grad_norm": 0.2834303677082062, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7059, + "grad_norm": 1.3462779521942139, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.702, + "grad_norm": 1.140692949295044, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6982, + "grad_norm": 1.110337734222412, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6903, + "grad_norm": 0.5540251135826111, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7055, + "grad_norm": 1.0985108613967896, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6949, + "grad_norm": 0.24486607313156128, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6926, + "grad_norm": 0.9832409620285034, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6966, + "grad_norm": 0.6323509216308594, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6914, + "grad_norm": 0.7212057709693909, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6996, + "grad_norm": 0.7943763732910156, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6955, + "grad_norm": 0.5179142355918884, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.7003, + "grad_norm": 0.9354894757270813, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6929, + "grad_norm": 0.033356476575136185, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6904, + "grad_norm": 0.6118065118789673, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6919, + "grad_norm": 0.0503770150244236, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6837, + "grad_norm": 0.3223724961280823, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.707, + "grad_norm": 0.778380811214447, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7332, + "grad_norm": 1.7186168432235718, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6988, + "grad_norm": 0.502504825592041, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.693, + "grad_norm": 0.03212689608335495, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7317, + "grad_norm": 1.6994009017944336, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.7058, + "grad_norm": 0.7400434017181396, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6957, + "grad_norm": 0.3466211259365082, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6895, + "grad_norm": 0.4400763511657715, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6924, + "grad_norm": 0.48643961548805237, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7154, + "grad_norm": 1.5077401399612427, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721993728, + "loss": 0.7084, + "grad_norm": 1.144429087638855, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7112, + "grad_norm": 1.3918482065200806, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721995264, + "loss": 0.6802, + "grad_norm": 1.1186531782150269, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7263, + "grad_norm": 2.208902359008789, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7897, + "grad_norm": 4.876894950866699, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7175, + "grad_norm": 2.461993455886841, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6938, + "grad_norm": 0.4089767336845398, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7509, + "grad_norm": 4.674840450286865, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6788, + "grad_norm": 1.767511010169983, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7572, + "grad_norm": 4.6527018547058105, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7086, + "grad_norm": 2.1728527545928955, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722013696, + "loss": 0.7109, + "grad_norm": 2.1390697956085205, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7411, + "grad_norm": 3.304837942123413, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.74, + "grad_norm": 3.0795912742614746, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7039, + "grad_norm": 1.306732416152954, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7067, + "grad_norm": 1.559555172920227, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6893, + "grad_norm": 0.14900675415992737, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7236, + "grad_norm": 2.231937885284424, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6917, + "grad_norm": 1.0128005743026733, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6925, + "grad_norm": 0.5809023976325989, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.7009, + "grad_norm": 1.0089682340621948, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6922, + "grad_norm": 0.5457061529159546, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6939, + "grad_norm": 0.22410281002521515, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6922, + "grad_norm": 0.6153343915939331, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7068, + "grad_norm": 1.7180736064910889, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6939, + "grad_norm": 0.9836577773094177, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6925, + "grad_norm": 0.663631021976471, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6864, + "grad_norm": 0.8140488862991333, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.672, + "grad_norm": 0.147809699177742, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6475, + "grad_norm": 1.171762466430664, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7124, + "grad_norm": 1.5292456150054932, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7114, + "grad_norm": 1.351194977760315, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722013696, + "loss": 0.7054, + "grad_norm": 1.1488467454910278, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7004, + "grad_norm": 0.7932384014129639, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.7148, + "grad_norm": 1.5226446390151978, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7385, + "grad_norm": 2.567781925201416, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6967, + "grad_norm": 0.7359356880187988, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6937, + "grad_norm": 1.1023664474487305, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6951, + "grad_norm": 0.4773405194282532, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6938, + "grad_norm": 0.40381762385368347, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7095, + "grad_norm": 2.455918788909912, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6968, + "grad_norm": 1.6153181791305542, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6938, + "grad_norm": 0.34931743144989014, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6981, + "grad_norm": 0.8530497550964355, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.677, + "grad_norm": 0.35526755452156067, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7242, + "grad_norm": 2.128298044204712, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7226, + "grad_norm": 2.095752239227295, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6996, + "grad_norm": 0.829171359539032, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6938, + "grad_norm": 0.06415485590696335, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7072, + "grad_norm": 1.3254549503326416, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7118, + "grad_norm": 1.389275312423706, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6969, + "grad_norm": 0.4752300977706909, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7012, + "grad_norm": 1.1394513845443726, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.69, + "grad_norm": 0.5380935072898865, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7009, + "grad_norm": 0.7160326242446899, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7238, + "grad_norm": 1.560713291168213, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7236, + "grad_norm": 1.5134072303771973, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721992192, + "loss": 0.6939, + "grad_norm": 0.3534829914569855, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6944, + "grad_norm": 0.23275676369667053, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.696, + "grad_norm": 0.5760765671730042, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6968, + "grad_norm": 0.40162572264671326, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6926, + "grad_norm": 0.1475517600774765, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6388, + "grad_norm": 1.8116768598556519, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7329, + "grad_norm": 1.452401041984558, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7314, + "grad_norm": 1.3823747634887695, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7292, + "grad_norm": 1.3154454231262207, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7089, + "grad_norm": 0.8156904578208923, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6949, + "grad_norm": 0.18324065208435059, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.692, + "grad_norm": 0.2909948527812958, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7029, + "grad_norm": 0.6741107106208801, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.674, + "grad_norm": 0.48998114466667175, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6978, + "grad_norm": 0.6333187818527222, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7258, + "grad_norm": 1.6222736835479736, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7166, + "grad_norm": 1.6382309198379517, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6936, + "grad_norm": 0.055141881108284, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.739, + "grad_norm": 2.675079584121704, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7478, + "grad_norm": 2.76946759223938, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7276, + "grad_norm": 2.1541788578033447, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6933, + "grad_norm": 0.1162518635392189, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7011, + "grad_norm": 0.6593225598335266, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7108, + "grad_norm": 1.0208215713500977, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6997, + "grad_norm": 0.5687302350997925, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722015232, + "loss": 0.6856, + "grad_norm": 0.6811498403549194, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6993, + "grad_norm": 0.7316981554031372, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6938, + "grad_norm": 0.6477458477020264, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722015232, + "loss": 0.689, + "grad_norm": 0.05032190680503845, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.687, + "grad_norm": 0.25585857033729553, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.741, + "grad_norm": 1.969105839729309, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6773, + "grad_norm": 0.11669527739286423, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7152, + "grad_norm": 1.2444268465042114, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6834, + "grad_norm": 0.30255258083343506, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6895, + "grad_norm": 0.3896716237068176, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6966, + "grad_norm": 0.49173131585121155, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6927, + "grad_norm": 0.7615609765052795, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6945, + "grad_norm": 0.24636490643024445, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.717, + "grad_norm": 1.3357396125793457, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6938, + "grad_norm": 0.32602453231811523, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7031, + "grad_norm": 0.7256397008895874, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6874, + "grad_norm": 0.2378477156162262, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6932, + "grad_norm": 0.1010773628950119, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6942, + "grad_norm": 0.12203691154718399, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6933, + "grad_norm": 0.38720273971557617, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6934, + "grad_norm": 0.11456672847270966, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6982, + "grad_norm": 1.3692469596862793, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.693, + "grad_norm": 0.2088041752576828, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6919, + "grad_norm": 0.08067941665649414, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6917, + "grad_norm": 0.04766916483640671, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7071, + "grad_norm": 0.714054524898529, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7064, + "grad_norm": 0.6950631737709045, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.681, + "grad_norm": 0.5628954172134399, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6936, + "grad_norm": 0.12058182060718536, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6917, + "grad_norm": 0.03569090738892555, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6991, + "grad_norm": 0.5240115523338318, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6913, + "grad_norm": 0.25071612000465393, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6924, + "grad_norm": 0.2947244942188263, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6928, + "grad_norm": 0.18944112956523895, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6925, + "grad_norm": 0.1659955233335495, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6919, + "grad_norm": 0.12951691448688507, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6879, + "grad_norm": 0.4179939031600952, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7039, + "grad_norm": 0.6893770098686218, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6894, + "grad_norm": 0.10409317910671234, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6892, + "grad_norm": 0.0931537002325058, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.696, + "grad_norm": 0.26734885573387146, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6982, + "grad_norm": 0.3635084927082062, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.7021, + "grad_norm": 0.6513778567314148, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6964, + "grad_norm": 0.5275430083274841, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6958, + "grad_norm": 0.4282492399215698, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6852, + "grad_norm": 0.4905282258987427, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6979, + "grad_norm": 0.3437327742576599, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7115, + "grad_norm": 0.7991838455200195, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7182, + "grad_norm": 1.0175750255584717, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6761, + "grad_norm": 0.6140373945236206, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6986, + "grad_norm": 0.3820516765117645, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6963, + "grad_norm": 0.31093692779541016, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6894, + "grad_norm": 0.48381736874580383, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6933, + "grad_norm": 0.05699583515524864, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6958, + "grad_norm": 0.7452366948127747, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6938, + "grad_norm": 0.05632916837930679, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6959, + "grad_norm": 0.3754892945289612, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7023, + "grad_norm": 0.9102657437324524, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6941, + "grad_norm": 0.12370803207159042, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6932, + "grad_norm": 0.05132635310292244, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6938, + "grad_norm": 0.08951124548912048, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6941, + "grad_norm": 0.3546404242515564, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6929, + "grad_norm": 0.07085944712162018, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.695, + "grad_norm": 0.2061643749475479, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6938, + "grad_norm": 0.12730197608470917, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6942, + "grad_norm": 0.13758644461631775, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6819, + "grad_norm": 0.953137993812561, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6932, + "grad_norm": 0.10767873376607895, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6891, + "grad_norm": 0.05973926559090614, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7006, + "grad_norm": 0.4356878399848938, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6859, + "grad_norm": 0.09186040610074997, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6893, + "grad_norm": 0.049820538610219955, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6825, + "grad_norm": 0.1556348204612732, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6856, + "grad_norm": 0.035700466483831406, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6858, + "grad_norm": 0.023349786177277565, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6856, + "grad_norm": 0.04076904430985451, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7116, + "grad_norm": 0.6944258213043213, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6973, + "grad_norm": 0.3278908431529999, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6684, + "grad_norm": 0.5812587141990662, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6758, + "grad_norm": 0.3776197135448456, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6929, + "grad_norm": 0.17476889491081238, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7108, + "grad_norm": 0.7049618363380432, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.7045, + "grad_norm": 0.555120587348938, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.699, + "grad_norm": 0.3617209196090698, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7015, + "grad_norm": 0.678576648235321, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6937, + "grad_norm": 0.03691326826810837, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6907, + "grad_norm": 0.2061082422733307, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7199, + "grad_norm": 1.404151201248169, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6783, + "grad_norm": 0.47311046719551086, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6965, + "grad_norm": 0.2852426767349243, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7082, + "grad_norm": 0.6729643940925598, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6787, + "grad_norm": 0.26002708077430725, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7005, + "grad_norm": 0.4206511974334717, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6957, + "grad_norm": 0.28226757049560547, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6985, + "grad_norm": 0.34583038091659546, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.7103, + "grad_norm": 0.8141036629676819, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6987, + "grad_norm": 0.39698874950408936, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6885, + "grad_norm": 0.6410323977470398, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6949, + "grad_norm": 0.34195244312286377, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6937, + "grad_norm": 0.04685399681329727, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6934, + "grad_norm": 0.04792621731758118, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6943, + "grad_norm": 0.13553772866725922, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6948, + "grad_norm": 0.16955740749835968, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7004, + "grad_norm": 0.5295268893241882, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6999, + "grad_norm": 0.5231136083602905, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6982, + "grad_norm": 0.5041728615760803, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6964, + "grad_norm": 0.47974154353141785, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6935, + "grad_norm": 0.3911196291446686, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6965, + "grad_norm": 0.49167630076408386, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6993, + "grad_norm": 0.7172385454177856, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6937, + "grad_norm": 0.06756031513214111, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7011, + "grad_norm": 0.7694441080093384, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6895, + "grad_norm": 0.36409929394721985, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6969, + "grad_norm": 0.43730631470680237, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6999, + "grad_norm": 0.9139493703842163, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6911, + "grad_norm": 0.6938626766204834, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6943, + "grad_norm": 0.3918024003505707, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6939, + "grad_norm": 0.5959441661834717, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6938, + "grad_norm": 0.259413480758667, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6944, + "grad_norm": 0.8133082985877991, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6938, + "grad_norm": 0.25433075428009033, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6902, + "grad_norm": 0.8785492181777954, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6913, + "grad_norm": 0.24204744398593903, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.7021, + "grad_norm": 0.7825486063957214, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7063, + "grad_norm": 1.0599770545959473, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.696, + "grad_norm": 0.29672661423683167, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6969, + "grad_norm": 0.37917080521583557, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6931, + "grad_norm": 0.03871191293001175, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6908, + "grad_norm": 0.375658243894577, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6912, + "grad_norm": 0.4859120547771454, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6935, + "grad_norm": 0.06265486776828766, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.696, + "grad_norm": 0.585866391658783, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6933, + "grad_norm": 0.05168027803301811, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6928, + "grad_norm": 0.08898420631885529, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6922, + "grad_norm": 0.4967854619026184, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6942, + "grad_norm": 0.43492287397384644, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6936, + "grad_norm": 0.12775187194347382, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6928, + "grad_norm": 0.1181931346654892, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6937, + "grad_norm": 0.039027538150548935, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6924, + "grad_norm": 0.20072561502456665, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6942, + "grad_norm": 0.5149424076080322, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6929, + "grad_norm": 0.3882922828197479, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6938, + "grad_norm": 0.38078826665878296, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.696, + "grad_norm": 0.9299198389053345, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6962, + "grad_norm": 1.1123151779174805, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6939, + "grad_norm": 0.0619676373898983, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6916, + "grad_norm": 0.4437364935874939, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6932, + "grad_norm": 0.11427575349807739, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6886, + "grad_norm": 0.26638174057006836, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6938, + "grad_norm": 0.18699491024017334, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6948, + "grad_norm": 0.21797117590904236, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.7079, + "grad_norm": 0.762208104133606, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6958, + "grad_norm": 0.2438938468694687, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6887, + "grad_norm": 0.08344245702028275, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6883, + "grad_norm": 0.09373270720243454, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7057, + "grad_norm": 0.7275422811508179, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6834, + "grad_norm": 0.4045466184616089, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722016768, + "loss": 0.7042, + "grad_norm": 0.7878732085227966, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7022, + "grad_norm": 0.7499514222145081, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6952, + "grad_norm": 0.31874969601631165, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.693, + "grad_norm": 0.17992019653320312, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.691, + "grad_norm": 0.16485540568828583, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6897, + "grad_norm": 0.221756249666214, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6944, + "grad_norm": 0.4367470443248749, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6893, + "grad_norm": 0.21301725506782532, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6911, + "grad_norm": 0.17393499612808228, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7033, + "grad_norm": 0.9295821785926819, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6976, + "grad_norm": 0.5456365346908569, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6871, + "grad_norm": 0.2547103464603424, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6854, + "grad_norm": 0.2985079288482666, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.7035, + "grad_norm": 0.5428258776664734, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6974, + "grad_norm": 0.2864561080932617, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.694, + "grad_norm": 0.20373576879501343, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6899, + "grad_norm": 0.3121243715286255, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.689, + "grad_norm": 0.19042468070983887, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6919, + "grad_norm": 0.12357912957668304, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722013696, + "loss": 0.6939, + "grad_norm": 0.26279664039611816, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6955, + "grad_norm": 0.3802650570869446, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6927, + "grad_norm": 0.1281893402338028, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.696, + "grad_norm": 0.28200215101242065, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.688, + "grad_norm": 0.5013014078140259, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6831, + "grad_norm": 1.0752190351486206, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6921, + "grad_norm": 0.11804443597793579, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6964, + "grad_norm": 0.5417994856834412, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6976, + "grad_norm": 0.43129071593284607, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6889, + "grad_norm": 0.2633020281791687, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6932, + "grad_norm": 0.4429054856300354, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6911, + "grad_norm": 0.07670487463474274, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.7023, + "grad_norm": 0.7201810479164124, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6951, + "grad_norm": 0.2277008295059204, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6902, + "grad_norm": 0.18292681872844696, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6886, + "grad_norm": 0.2133987694978714, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6952, + "grad_norm": 0.8150560259819031, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.697, + "grad_norm": 0.4710053503513336, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6868, + "grad_norm": 0.34910866618156433, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6925, + "grad_norm": 0.6385883688926697, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6877, + "grad_norm": 0.3366970717906952, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6923, + "grad_norm": 0.1365206092596054, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6872, + "grad_norm": 0.6673657894134521, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.7102, + "grad_norm": 0.6619124412536621, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6933, + "grad_norm": 0.39239004254341125, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.707, + "grad_norm": 0.7946501970291138, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.7031, + "grad_norm": 0.4762343764305115, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6922, + "grad_norm": 0.13024264574050903, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6942, + "grad_norm": 0.22889314591884613, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6922, + "grad_norm": 0.156460702419281, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6916, + "grad_norm": 0.6951426863670349, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6937, + "grad_norm": 0.3074381351470947, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6903, + "grad_norm": 0.14660872519016266, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6936, + "grad_norm": 0.0688883513212204, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6923, + "grad_norm": 0.06541229039430618, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6898, + "grad_norm": 0.2879359722137451, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6939, + "grad_norm": 0.6094048023223877, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72201216, + "loss": 0.6909, + "grad_norm": 0.18644501268863678, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6908, + "grad_norm": 0.5668491721153259, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.692, + "grad_norm": 0.3535112738609314, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6945, + "grad_norm": 0.5275602340698242, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6939, + "grad_norm": 0.8414390087127686, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6922, + "grad_norm": 0.443567156791687, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6935, + "grad_norm": 0.332862913608551, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6923, + "grad_norm": 0.8357807397842407, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722010624, + "loss": 0.6947, + "grad_norm": 0.14301778376102448, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.691, + "grad_norm": 0.49041053652763367, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6961, + "grad_norm": 0.26852214336395264, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6908, + "grad_norm": 0.5034162402153015, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6969, + "grad_norm": 0.35638222098350525, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6945, + "grad_norm": 0.32320109009742737, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6963, + "grad_norm": 0.1886107325553894, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6976, + "grad_norm": 0.24228334426879883, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722009088, + "loss": 0.6894, + "grad_norm": 0.4074154794216156, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6884, + "grad_norm": 0.2438240349292755, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721998336, + "loss": 0.6975, + "grad_norm": 0.6965439915657043, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6968, + "grad_norm": 0.3171299695968628, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6935, + "grad_norm": 0.34889906644821167, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6979, + "grad_norm": 0.4664706587791443, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.689, + "grad_norm": 0.5706819891929626, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6959, + "grad_norm": 0.14852261543273926, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6936, + "grad_norm": 0.4180295169353485, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.6911, + "grad_norm": 0.33913618326187134, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6916, + "grad_norm": 0.15716542303562164, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.692, + "grad_norm": 0.4292500913143158, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6913, + "grad_norm": 0.4329659044742584, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6881, + "grad_norm": 0.17447079718112946, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6904, + "grad_norm": 0.2905360460281372, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6921, + "grad_norm": 0.12517249584197998, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6882, + "grad_norm": 0.25557687878608704, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6917, + "grad_norm": 0.5218807458877563, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6913, + "grad_norm": 0.564297616481781, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6917, + "grad_norm": 0.18698681890964508, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6946, + "grad_norm": 0.5373379588127136, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6927, + "grad_norm": 0.17427648603916168, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6923, + "grad_norm": 0.23881128430366516, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6871, + "grad_norm": 0.17665249109268188, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6929, + "grad_norm": 0.21103167533874512, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6875, + "grad_norm": 0.2518172264099121, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722001408, + "loss": 0.6983, + "grad_norm": 0.4583607614040375, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.721999872, + "loss": 0.6908, + "grad_norm": 0.3846648037433624, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722013696, + "loss": 0.6961, + "grad_norm": 0.33054932951927185, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6913, + "grad_norm": 0.2451932281255722, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6923, + "grad_norm": 0.16210006177425385, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.507028992, + "gpu_mem": 4.72200448, + "loss": 0.6915, + "grad_norm": 0.3608314096927643, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6939, + "grad_norm": 0.18425299227237701, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.507028992, + "gpu_mem": 4.7219968, + "loss": 0.695, + "grad_norm": 0.1473870575428009, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722007552, + "loss": 0.6923, + "grad_norm": 0.663429856300354, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6909, + "grad_norm": 0.42640239000320435, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722006016, + "loss": 0.6913, + "grad_norm": 0.4275354743003845, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "loss": 0.6906, + "grad_norm": 0.3600873649120331, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.507028992, + "gpu_mem": 4.722002944, + "train_runtime": 1398.1988, + "train_samples_per_second": 29.278, + "train_steps_per_second": 0.458, + "total_flos": 0.0, + "train_loss": 0.7189324093982578 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6934cfad94edb068f0d54db83e6a8b58f0fc939 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "A" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..172f382ff656ab30025c1543cd83e6c02131ae11 --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5011838989739542 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5df66fd72c8d93d6d69318a93c910dcc3ff09dcb --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "abl_A", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6317696 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_A-winogrande-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T23:07:33.378108" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..ab226a8c72a605a1e962ee98508ea3698737c19d --- /dev/null +++ b/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-winogrande-r8-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.489842176, + "gpu_mem": 4.442644992, + "loss": 3.3802, + "grad_norm": 241.92054748535156, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.495740416, + "gpu_mem": 4.49326336, + "loss": 3.3361, + "grad_norm": 237.23513793945312, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.496133632, + "gpu_mem": 4.493267968, + "loss": 2.6156, + "grad_norm": 206.58497619628906, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.496526848, + "gpu_mem": 4.493266432, + "loss": 1.6678, + "grad_norm": 133.3934783935547, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.496920064, + "gpu_mem": 4.493266432, + "loss": 1.1536, + "grad_norm": 116.51089477539062, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.49731328, + "gpu_mem": 4.493272576, + "loss": 0.9124, + "grad_norm": 32.28062438964844, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.497509888, + "gpu_mem": 4.49327872, + "loss": 0.7486, + "grad_norm": 14.254436492919922, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.497706496, + "gpu_mem": 4.493261824, + "loss": 0.802, + "grad_norm": 72.2181167602539, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.497903104, + "gpu_mem": 4.493267968, + "loss": 0.6977, + "grad_norm": 20.233924865722656, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.498099712, + "gpu_mem": 4.49327104, + "loss": 0.7983, + "grad_norm": 46.40990447998047, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.49829632, + "gpu_mem": 4.493260288, + "loss": 0.6855, + "grad_norm": 7.200876235961914, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.49829632, + "gpu_mem": 4.493264896, + "loss": 0.8906, + "grad_norm": 47.66444778442383, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.498492928, + "gpu_mem": 4.493272576, + "loss": 0.7376, + "grad_norm": 15.40773868560791, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.498689536, + "gpu_mem": 4.493267968, + "loss": 0.6948, + "grad_norm": 5.827868938446045, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.498886144, + "gpu_mem": 4.493267968, + "loss": 0.7189, + "grad_norm": 14.677419662475586, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.498886144, + "gpu_mem": 4.493264896, + "loss": 0.7133, + "grad_norm": 5.714837074279785, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.499082752, + "gpu_mem": 4.493264896, + "loss": 0.7044, + "grad_norm": 5.801148414611816, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.499082752, + "gpu_mem": 4.493267968, + "loss": 0.7074, + "grad_norm": 6.313905239105225, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.499082752, + "gpu_mem": 4.493264896, + "loss": 0.6966, + "grad_norm": 3.1683034896850586, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.49927936, + "gpu_mem": 4.493272576, + "loss": 0.6793, + "grad_norm": 3.106034278869629, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.49927936, + "gpu_mem": 4.493264896, + "loss": 0.7682, + "grad_norm": 17.09679412841797, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.499475968, + "gpu_mem": 4.493264896, + "loss": 0.7315, + "grad_norm": 11.036335945129395, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.499475968, + "gpu_mem": 4.493260288, + "loss": 0.8852, + "grad_norm": 38.46369934082031, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.499475968, + "gpu_mem": 4.49326336, + "loss": 0.8004, + "grad_norm": 22.296222686767578, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.499475968, + "gpu_mem": 4.493266432, + "loss": 0.7041, + "grad_norm": 4.417088031768799, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493261824, + "loss": 0.7508, + "grad_norm": 17.992212295532227, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493260288, + "loss": 0.9875, + "grad_norm": 64.20729064941406, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493266432, + "loss": 0.708, + "grad_norm": 10.643282890319824, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493264896, + "loss": 0.7655, + "grad_norm": 14.436238288879395, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493264896, + "loss": 0.6903, + "grad_norm": 2.0598816871643066, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493264896, + "loss": 0.7092, + "grad_norm": 6.2164812088012695, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493261824, + "loss": 0.691, + "grad_norm": 4.591337203979492, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493261824, + "loss": 0.7468, + "grad_norm": 10.687963485717773, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.499672576, + "gpu_mem": 4.493261824, + "loss": 0.7017, + "grad_norm": 2.3519680500030518, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.499869184, + "gpu_mem": 4.493267968, + "loss": 0.7106, + "grad_norm": 4.529821395874023, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.499869184, + "gpu_mem": 4.49326336, + "loss": 0.6864, + "grad_norm": 1.711564064025879, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.499869184, + "gpu_mem": 4.493261824, + "loss": 0.8478, + "grad_norm": 16.192481994628906, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.499869184, + "gpu_mem": 4.493266432, + "loss": 0.7852, + "grad_norm": 12.088907241821289, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493272576, + "loss": 0.6928, + "grad_norm": 2.703690528869629, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493269504, + "loss": 0.6955, + "grad_norm": 1.5588886737823486, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493269504, + "loss": 0.8198, + "grad_norm": 14.196533203125, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493266432, + "loss": 0.7159, + "grad_norm": 7.397229194641113, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493266432, + "loss": 0.7554, + "grad_norm": 9.009133338928223, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493266432, + "loss": 0.7439, + "grad_norm": 8.223872184753418, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493274112, + "loss": 0.7477, + "grad_norm": 7.320959568023682, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493266432, + "loss": 0.7, + "grad_norm": 1.0148037672042847, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493267968, + "loss": 0.702, + "grad_norm": 2.2630841732025146, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493267968, + "loss": 0.6933, + "grad_norm": 2.9933841228485107, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493258752, + "loss": 0.7554, + "grad_norm": 9.739350318908691, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493261824, + "loss": 0.7125, + "grad_norm": 3.671002149581909, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493269504, + "loss": 0.6863, + "grad_norm": 0.5323218107223511, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.500065792, + "gpu_mem": 4.49326336, + "loss": 0.7075, + "grad_norm": 3.700941562652588, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493267968, + "loss": 0.6812, + "grad_norm": 0.809123694896698, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.500065792, + "gpu_mem": 4.49325568, + "loss": 0.9025, + "grad_norm": 15.331682205200195, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493260288, + "loss": 0.8268, + "grad_norm": 10.932428359985352, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493267968, + "loss": 0.7215, + "grad_norm": 4.250907897949219, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493258752, + "loss": 0.6913, + "grad_norm": 0.7723212242126465, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493261824, + "loss": 0.7933, + "grad_norm": 8.13784408569336, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493274112, + "loss": 0.788, + "grad_norm": 8.403929710388184, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493264896, + "loss": 0.7033, + "grad_norm": 2.1682679653167725, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493264896, + "loss": 0.6999, + "grad_norm": 1.0328296422958374, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493264896, + "loss": 0.7319, + "grad_norm": 4.539892196655273, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.500065792, + "gpu_mem": 4.49326336, + "loss": 0.7322, + "grad_norm": 5.495602607727051, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493272576, + "loss": 0.7487, + "grad_norm": 6.873269081115723, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493261824, + "loss": 0.6861, + "grad_norm": 1.7422949075698853, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493267968, + "loss": 0.6895, + "grad_norm": 0.5567264556884766, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.500065792, + "gpu_mem": 4.493266432, + "loss": 0.8246, + "grad_norm": 7.1748247146606445, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7517, + "grad_norm": 4.574430465698242, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7036, + "grad_norm": 1.8765952587127686, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.7803, + "grad_norm": 6.148961544036865, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7692, + "grad_norm": 5.490943431854248, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7095, + "grad_norm": 2.5259900093078613, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7864, + "grad_norm": 6.741501808166504, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.7644, + "grad_norm": 5.247608184814453, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7472, + "grad_norm": 5.129408836364746, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7772, + "grad_norm": 7.057398796081543, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.7174, + "grad_norm": 2.686394691467285, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6997, + "grad_norm": 1.111324667930603, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6972, + "grad_norm": 2.704512596130371, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6946, + "grad_norm": 1.1792985200881958, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6948, + "grad_norm": 12.228708267211914, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6938, + "grad_norm": 3.419274091720581, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6926, + "grad_norm": 5.114155292510986, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 1.6208, + "grad_norm": 57.96391677856445, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.9398, + "grad_norm": 17.7019100189209, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6948, + "grad_norm": 0.6490803360939026, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7052, + "grad_norm": 3.2710015773773193, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7411, + "grad_norm": 4.992422580718994, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6906, + "grad_norm": 0.9565786123275757, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7411, + "grad_norm": 4.6436543464660645, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6727, + "grad_norm": 2.3105859756469727, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 1.4656, + "grad_norm": 44.74017333984375, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 1.1692, + "grad_norm": 21.132986068725586, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7387, + "grad_norm": 6.328951358795166, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.8933, + "grad_norm": 16.533849716186523, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7163, + "grad_norm": 4.40908670425415, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6933, + "grad_norm": 0.4498700499534607, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7547, + "grad_norm": 5.490511417388916, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7939, + "grad_norm": 7.969686985015869, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6962, + "grad_norm": 1.1928246021270752, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6917, + "grad_norm": 0.264098197221756, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.7083, + "grad_norm": 2.608646869659424, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7072, + "grad_norm": 2.576261043548584, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6929, + "grad_norm": 0.9027281999588013, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7033, + "grad_norm": 1.727279543876648, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6946, + "grad_norm": 1.028830647468567, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7178, + "grad_norm": 2.843503475189209, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7539, + "grad_norm": 4.509593486785889, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7, + "grad_norm": 1.3366409540176392, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7319, + "grad_norm": 4.228572368621826, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7168, + "grad_norm": 2.5328028202056885, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7023, + "grad_norm": 1.5049101114273071, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493257216, + "loss": 0.6825, + "grad_norm": 1.373574137687683, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7285, + "grad_norm": 4.410015106201172, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7883, + "grad_norm": 7.70558500289917, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.8674, + "grad_norm": 11.392669677734375, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6921, + "grad_norm": 0.27105242013931274, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7158, + "grad_norm": 3.3987226486206055, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7455, + "grad_norm": 6.11848783493042, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7053, + "grad_norm": 2.1207821369171143, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7251, + "grad_norm": 3.593137264251709, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7043, + "grad_norm": 3.666558027267456, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.729, + "grad_norm": 3.8450446128845215, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7744, + "grad_norm": 5.433503150939941, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493257216, + "loss": 0.7151, + "grad_norm": 2.581589460372925, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6934, + "grad_norm": 0.2945093810558319, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7188, + "grad_norm": 2.12423038482666, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.7511, + "grad_norm": 3.161350965499878, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6991, + "grad_norm": 0.8415344953536987, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7003, + "grad_norm": 1.016400694847107, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7221, + "grad_norm": 2.540095567703247, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6883, + "grad_norm": 0.5550497174263, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6773, + "grad_norm": 2.0311598777770996, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7111, + "grad_norm": 1.7298951148986816, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6729, + "grad_norm": 0.485177606344223, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6771, + "grad_norm": 0.2785748243331909, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6913, + "grad_norm": 0.9659852981567383, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6972, + "grad_norm": 0.8120244145393372, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.696, + "grad_norm": 0.6336333751678467, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7359, + "grad_norm": 3.1806602478027344, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7234, + "grad_norm": 2.5956625938415527, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6902, + "grad_norm": 0.7235323190689087, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7118, + "grad_norm": 1.4236794710159302, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7215, + "grad_norm": 1.7723703384399414, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6842, + "grad_norm": 0.61899733543396, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6901, + "grad_norm": 1.285110592842102, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7135, + "grad_norm": 1.925487995147705, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6944, + "grad_norm": 0.147901251912117, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6944, + "grad_norm": 1.6006721258163452, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6845, + "grad_norm": 0.5316305160522461, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.745, + "grad_norm": 3.1067898273468018, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7175, + "grad_norm": 1.9210160970687866, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6882, + "grad_norm": 0.17985808849334717, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6941, + "grad_norm": 0.15282978117465973, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.681, + "grad_norm": 0.3058810830116272, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.7488, + "grad_norm": 4.585237503051758, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6123, + "grad_norm": 3.74735951423645, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493275648, + "loss": 0.7517, + "grad_norm": 5.700966835021973, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6966, + "grad_norm": 2.520826816558838, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6924, + "grad_norm": 0.431797593832016, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7387, + "grad_norm": 6.695068359375, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7519, + "grad_norm": 7.507557392120361, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6755, + "grad_norm": 0.8674174547195435, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6922, + "grad_norm": 2.699544906616211, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7022, + "grad_norm": 2.6713438034057617, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6978, + "grad_norm": 0.9585144519805908, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6961, + "grad_norm": 0.9979376196861267, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6978, + "grad_norm": 0.8003730177879333, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6933, + "grad_norm": 0.5531396269798279, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6962, + "grad_norm": 1.5462050437927246, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7019, + "grad_norm": 2.304170846939087, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.696, + "grad_norm": 0.9403184056282043, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493257216, + "loss": 0.6947, + "grad_norm": 0.5160229206085205, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7001, + "grad_norm": 0.9345813393592834, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7051, + "grad_norm": 1.832680583000183, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6978, + "grad_norm": 0.8821761608123779, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7015, + "grad_norm": 1.3142913579940796, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6944, + "grad_norm": 0.6960073113441467, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6971, + "grad_norm": 0.7478674650192261, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6934, + "grad_norm": 0.12494221329689026, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493280256, + "loss": 0.6776, + "grad_norm": 1.4560763835906982, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7581, + "grad_norm": 3.400517463684082, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7568, + "grad_norm": 3.2092673778533936, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7291, + "grad_norm": 2.3551928997039795, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6941, + "grad_norm": 0.7031469941139221, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7166, + "grad_norm": 1.6007237434387207, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6779, + "grad_norm": 0.19755305349826813, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7012, + "grad_norm": 0.9701418280601501, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6971, + "grad_norm": 1.077275037765503, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7857, + "grad_norm": 6.793179512023926, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.8391, + "grad_norm": 9.918543815612793, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6771, + "grad_norm": 1.46857488155365, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6945, + "grad_norm": 0.539936363697052, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.7095, + "grad_norm": 3.961989402770996, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6758, + "grad_norm": 2.863858461380005, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7017, + "grad_norm": 2.6536271572113037, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.7035, + "grad_norm": 2.418693780899048, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6928, + "grad_norm": 0.7153254747390747, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6991, + "grad_norm": 2.8292646408081055, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6968, + "grad_norm": 1.492846965789795, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.694, + "grad_norm": 0.3003822863101959, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7146, + "grad_norm": 3.9491968154907227, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6833, + "grad_norm": 3.211869478225708, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.695, + "grad_norm": 0.7908339500427246, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6931, + "grad_norm": 0.45537179708480835, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6946, + "grad_norm": 0.49169057607650757, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7035, + "grad_norm": 1.8431308269500732, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7135, + "grad_norm": 3.174839496612549, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6937, + "grad_norm": 1.3419971466064453, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.693, + "grad_norm": 0.9114062190055847, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7184, + "grad_norm": 3.1268670558929443, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6989, + "grad_norm": 1.0792210102081299, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6941, + "grad_norm": 0.6904277205467224, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6927, + "grad_norm": 0.29041874408721924, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.7054, + "grad_norm": 1.6864479780197144, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6982, + "grad_norm": 1.7221693992614746, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7034, + "grad_norm": 1.277147889137268, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7213, + "grad_norm": 2.4702181816101074, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6915, + "grad_norm": 0.08919399231672287, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6935, + "grad_norm": 0.10424022376537323, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7009, + "grad_norm": 1.8277904987335205, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6936, + "grad_norm": 0.20885172486305237, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6919, + "grad_norm": 0.8126945495605469, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6941, + "grad_norm": 0.3484436571598053, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.71, + "grad_norm": 1.4394502639770508, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7018, + "grad_norm": 0.9746183156967163, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6934, + "grad_norm": 0.08334704488515854, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6984, + "grad_norm": 0.7150031924247742, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7006, + "grad_norm": 0.8219296336174011, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6997, + "grad_norm": 0.7749820947647095, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.698, + "grad_norm": 1.2074204683303833, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6955, + "grad_norm": 0.6325491666793823, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.694, + "grad_norm": 0.06653673201799393, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493275648, + "loss": 0.6948, + "grad_norm": 0.17440035939216614, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6996, + "grad_norm": 0.8864902257919312, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6894, + "grad_norm": 1.1396602392196655, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.7053, + "grad_norm": 1.2941300868988037, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.704, + "grad_norm": 2.0237395763397217, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6995, + "grad_norm": 0.8102009892463684, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6959, + "grad_norm": 0.41678386926651, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6807, + "grad_norm": 0.8438976407051086, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6823, + "grad_norm": 0.24372252821922302, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.7396, + "grad_norm": 2.019932270050049, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7165, + "grad_norm": 1.2462793588638306, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.707, + "grad_norm": 1.0546326637268066, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6946, + "grad_norm": 0.48132720589637756, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6921, + "grad_norm": 0.17469234764575958, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7209, + "grad_norm": 1.9517871141433716, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6859, + "grad_norm": 0.11797522008419037, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7247, + "grad_norm": 2.208719491958618, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6896, + "grad_norm": 0.06746906042098999, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6875, + "grad_norm": 0.8524601459503174, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6938, + "grad_norm": 0.2273312509059906, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6949, + "grad_norm": 0.37860554456710815, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6919, + "grad_norm": 0.5357683300971985, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6969, + "grad_norm": 0.7294994592666626, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6947, + "grad_norm": 0.26196059584617615, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6919, + "grad_norm": 0.8628998398780823, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7165, + "grad_norm": 2.6308817863464355, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7065, + "grad_norm": 1.732404351234436, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6869, + "grad_norm": 1.8235645294189453, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6954, + "grad_norm": 0.3137063980102539, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493275648, + "loss": 0.6934, + "grad_norm": 0.22100473940372467, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6931, + "grad_norm": 0.19698597490787506, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6934, + "grad_norm": 0.4261871874332428, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.697, + "grad_norm": 0.9189908504486084, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6932, + "grad_norm": 0.6724843978881836, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7077, + "grad_norm": 1.6830873489379883, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6874, + "grad_norm": 0.3641982972621918, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7104, + "grad_norm": 1.629470705986023, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6957, + "grad_norm": 0.4395933747291565, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6916, + "grad_norm": 1.0776561498641968, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6959, + "grad_norm": 0.681092381477356, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6937, + "grad_norm": 0.9071853756904602, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6973, + "grad_norm": 0.8492677807807922, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6944, + "grad_norm": 0.5235853791236877, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.7015, + "grad_norm": 1.0504201650619507, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6925, + "grad_norm": 0.07368438690900803, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6922, + "grad_norm": 0.7447234988212585, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6926, + "grad_norm": 0.13185036182403564, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6846, + "grad_norm": 0.42570960521698, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7048, + "grad_norm": 0.7761011123657227, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7316, + "grad_norm": 1.8634319305419922, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7003, + "grad_norm": 0.6440865397453308, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6932, + "grad_norm": 0.10810733586549759, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7234, + "grad_norm": 1.994807481765747, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.7052, + "grad_norm": 0.8896049857139587, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6975, + "grad_norm": 0.5150673985481262, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6948, + "grad_norm": 0.7968298196792603, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6897, + "grad_norm": 0.09721643477678299, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7023, + "grad_norm": 1.1131184101104736, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49325568, + "loss": 0.7048, + "grad_norm": 1.2400481700897217, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7228, + "grad_norm": 2.0650367736816406, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493257216, + "loss": 0.6956, + "grad_norm": 1.948431372642517, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7166, + "grad_norm": 2.5801656246185303, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7897, + "grad_norm": 7.1929450035095215, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7359, + "grad_norm": 4.777463912963867, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7183, + "grad_norm": 3.6388299465179443, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6904, + "grad_norm": 0.5462889075279236, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6898, + "grad_norm": 3.060173749923706, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7095, + "grad_norm": 2.9402120113372803, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7179, + "grad_norm": 3.5880584716796875, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493275648, + "loss": 0.6958, + "grad_norm": 1.4134478569030762, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6891, + "grad_norm": 0.14993619918823242, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.689, + "grad_norm": 1.0507315397262573, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6916, + "grad_norm": 0.31253916025161743, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7009, + "grad_norm": 2.317969799041748, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6925, + "grad_norm": 1.1158487796783447, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6984, + "grad_norm": 2.6258902549743652, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.696, + "grad_norm": 2.1946189403533936, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6927, + "grad_norm": 1.0818856954574585, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.7003, + "grad_norm": 1.4573496580123901, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6897, + "grad_norm": 0.21819184720516205, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6972, + "grad_norm": 0.9096593260765076, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6892, + "grad_norm": 0.3919566869735718, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7011, + "grad_norm": 1.9796565771102905, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6923, + "grad_norm": 1.0768581628799438, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6895, + "grad_norm": 0.23801913857460022, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6817, + "grad_norm": 0.4173364043235779, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6718, + "grad_norm": 0.3730367422103882, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6373, + "grad_norm": 1.2801495790481567, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7242, + "grad_norm": 3.726969003677368, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7357, + "grad_norm": 4.303971767425537, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493275648, + "loss": 0.7604, + "grad_norm": 5.400115013122559, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7155, + "grad_norm": 2.808410406112671, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6988, + "grad_norm": 1.4254833459854126, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6995, + "grad_norm": 2.451799154281616, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6946, + "grad_norm": 0.8888295292854309, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.688, + "grad_norm": 0.8599239587783813, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6982, + "grad_norm": 1.2661281824111938, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6954, + "grad_norm": 0.6262684464454651, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7249, + "grad_norm": 3.004535675048828, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7198, + "grad_norm": 2.6521518230438232, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6934, + "grad_norm": 0.16629600524902344, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.701, + "grad_norm": 1.5891329050064087, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6766, + "grad_norm": 0.27493518590927124, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7328, + "grad_norm": 3.8974907398223877, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.721, + "grad_norm": 3.181713342666626, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.695, + "grad_norm": 0.6114965081214905, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7, + "grad_norm": 0.7707285284996033, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7176, + "grad_norm": 1.9871183633804321, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7079, + "grad_norm": 1.5275901556015015, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6935, + "grad_norm": 0.05540606006979942, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6855, + "grad_norm": 0.6088613867759705, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6857, + "grad_norm": 0.05370793864130974, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.7094, + "grad_norm": 1.3492392301559448, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7267, + "grad_norm": 2.3795320987701416, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7169, + "grad_norm": 2.334015369415283, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493254144, + "loss": 0.6922, + "grad_norm": 0.09802702069282532, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6954, + "grad_norm": 0.7963466644287109, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.689, + "grad_norm": 0.23487374186515808, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7064, + "grad_norm": 2.3972465991973877, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6945, + "grad_norm": 0.9253482818603516, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6587, + "grad_norm": 5.5463690757751465, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7048, + "grad_norm": 2.2290616035461426, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6958, + "grad_norm": 1.160280704498291, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6906, + "grad_norm": 0.48988962173461914, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6928, + "grad_norm": 0.5344785451889038, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7073, + "grad_norm": 1.3381860256195068, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6916, + "grad_norm": 0.505049467086792, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7081, + "grad_norm": 1.2437971830368042, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.678, + "grad_norm": 0.9628796577453613, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.691, + "grad_norm": 0.10424486547708511, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7017, + "grad_norm": 1.034665584564209, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6949, + "grad_norm": 1.1727701425552368, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6936, + "grad_norm": 0.25218909978866577, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7241, + "grad_norm": 2.5076680183410645, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7253, + "grad_norm": 2.3764379024505615, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7196, + "grad_norm": 2.071523904800415, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6952, + "grad_norm": 0.490558385848999, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.693, + "grad_norm": 0.09968169033527374, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6945, + "grad_norm": 0.4566900134086609, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.694, + "grad_norm": 0.20526276528835297, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493277184, + "loss": 0.6864, + "grad_norm": 0.682645320892334, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7034, + "grad_norm": 0.7644042372703552, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7045, + "grad_norm": 0.867067277431488, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493277184, + "loss": 0.698, + "grad_norm": 0.5723317265510559, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6922, + "grad_norm": 0.5128520131111145, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7027, + "grad_norm": 0.7763327360153198, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6807, + "grad_norm": 0.4654196500778198, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7072, + "grad_norm": 0.9251275658607483, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6815, + "grad_norm": 0.09352774173021317, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6865, + "grad_norm": 0.17104831337928772, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7156, + "grad_norm": 1.282277226448059, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7223, + "grad_norm": 1.6112420558929443, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.7, + "grad_norm": 0.5846309065818787, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6913, + "grad_norm": 0.5182252526283264, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6933, + "grad_norm": 0.26854726672172546, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6949, + "grad_norm": 0.331524133682251, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6884, + "grad_norm": 0.3172411024570465, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6948, + "grad_norm": 0.2731734812259674, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6986, + "grad_norm": 0.4783206582069397, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6891, + "grad_norm": 0.06278761476278305, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6998, + "grad_norm": 0.5763875842094421, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6757, + "grad_norm": 1.269555926322937, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6919, + "grad_norm": 0.07345152646303177, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6916, + "grad_norm": 0.11456043273210526, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6922, + "grad_norm": 0.130578875541687, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7048, + "grad_norm": 0.9714057445526123, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6985, + "grad_norm": 0.8308908343315125, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6974, + "grad_norm": 1.545379638671875, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6947, + "grad_norm": 0.4468715190887451, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6971, + "grad_norm": 0.7232534289360046, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.692, + "grad_norm": 0.4824254512786865, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6937, + "grad_norm": 0.7461310625076294, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6925, + "grad_norm": 0.5481137633323669, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6918, + "grad_norm": 0.11391840875148773, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6925, + "grad_norm": 0.11428897082805634, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6926, + "grad_norm": 0.24725113809108734, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6814, + "grad_norm": 0.1699390858411789, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7213, + "grad_norm": 1.2918518781661987, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6892, + "grad_norm": 0.1743762344121933, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6894, + "grad_norm": 0.11508974432945251, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6982, + "grad_norm": 0.4648977816104889, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6982, + "grad_norm": 0.4838847219944, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6987, + "grad_norm": 0.7993524074554443, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6903, + "grad_norm": 0.5528243184089661, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.7043, + "grad_norm": 1.0386658906936646, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6778, + "grad_norm": 0.44743525981903076, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.706, + "grad_norm": 0.997342586517334, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7231, + "grad_norm": 1.649965524673462, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7261, + "grad_norm": 1.8350296020507812, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6734, + "grad_norm": 0.8413083553314209, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7006, + "grad_norm": 0.6736396551132202, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6979, + "grad_norm": 0.506266176700592, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6898, + "grad_norm": 0.7107889652252197, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.694, + "grad_norm": 0.11548663675785065, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6945, + "grad_norm": 0.9708290696144104, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6937, + "grad_norm": 0.12105577439069748, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6966, + "grad_norm": 0.5344308018684387, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7043, + "grad_norm": 1.186991810798645, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6937, + "grad_norm": 0.2096429020166397, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6934, + "grad_norm": 0.07441913336515427, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.694, + "grad_norm": 0.10533465445041656, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6935, + "grad_norm": 0.4451207220554352, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6922, + "grad_norm": 0.09995988011360168, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6946, + "grad_norm": 0.2722465693950653, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6939, + "grad_norm": 0.19642047584056854, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6946, + "grad_norm": 0.22155985236167908, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6801, + "grad_norm": 1.199425458908081, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6934, + "grad_norm": 0.17855043709278107, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6894, + "grad_norm": 0.08617018163204193, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7001, + "grad_norm": 0.5804123878479004, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6863, + "grad_norm": 0.16037242114543915, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6888, + "grad_norm": 0.056581269949674606, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6828, + "grad_norm": 0.27575674653053284, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6854, + "grad_norm": 0.09556476026773453, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6854, + "grad_norm": 0.0785076692700386, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6866, + "grad_norm": 0.07456810772418976, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7104, + "grad_norm": 0.9712949991226196, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6966, + "grad_norm": 0.4443783462047577, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6692, + "grad_norm": 0.8357337713241577, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6761, + "grad_norm": 0.5487228631973267, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6919, + "grad_norm": 0.24701420962810516, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.71, + "grad_norm": 0.9884291291236877, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.7038, + "grad_norm": 0.758908748626709, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6985, + "grad_norm": 0.4858686923980713, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7014, + "grad_norm": 0.8791607618331909, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6956, + "grad_norm": 0.10233551263809204, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6909, + "grad_norm": 0.269578218460083, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.7177, + "grad_norm": 1.6057580709457397, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6775, + "grad_norm": 0.5913205146789551, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6962, + "grad_norm": 0.3205437958240509, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7088, + "grad_norm": 0.7877692580223083, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6792, + "grad_norm": 0.32332825660705566, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.7008, + "grad_norm": 0.5397357940673828, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6954, + "grad_norm": 0.38779446482658386, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6985, + "grad_norm": 0.4589802324771881, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7097, + "grad_norm": 1.100497841835022, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6981, + "grad_norm": 0.5463441610336304, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6904, + "grad_norm": 0.9861207008361816, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6936, + "grad_norm": 0.44449710845947266, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6933, + "grad_norm": 0.13664671778678894, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6934, + "grad_norm": 0.09472903609275818, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6956, + "grad_norm": 0.3515406847000122, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6962, + "grad_norm": 0.374911367893219, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7016, + "grad_norm": 0.8814480900764465, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7002, + "grad_norm": 0.766776978969574, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6955, + "grad_norm": 0.5970671772956848, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6952, + "grad_norm": 0.5079426765441895, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6902, + "grad_norm": 0.31031328439712524, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6987, + "grad_norm": 0.5803223252296448, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7039, + "grad_norm": 0.7869393229484558, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6937, + "grad_norm": 0.17968864738941193, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7063, + "grad_norm": 0.8002002835273743, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6875, + "grad_norm": 0.2426045536994934, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6993, + "grad_norm": 0.48433029651641846, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.7033, + "grad_norm": 0.9329314827919006, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6884, + "grad_norm": 0.6353458166122437, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6957, + "grad_norm": 0.44617047905921936, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6924, + "grad_norm": 0.5938495993614197, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6931, + "grad_norm": 0.25697192549705505, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6939, + "grad_norm": 0.8882484436035156, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6933, + "grad_norm": 0.2750925123691559, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6916, + "grad_norm": 0.9524023532867432, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6919, + "grad_norm": 0.31535953283309937, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6999, + "grad_norm": 0.7158803939819336, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7017, + "grad_norm": 0.9887136220932007, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.695, + "grad_norm": 0.24638275802135468, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6956, + "grad_norm": 0.34131544828414917, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6942, + "grad_norm": 0.09952346235513687, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6921, + "grad_norm": 0.43539658188819885, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6918, + "grad_norm": 0.5655514001846313, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6946, + "grad_norm": 0.10511130839586258, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6945, + "grad_norm": 0.6537057757377625, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.694, + "grad_norm": 0.07887302339076996, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6939, + "grad_norm": 0.12917043268680573, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.692, + "grad_norm": 0.5770266056060791, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6923, + "grad_norm": 0.5236108303070068, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6952, + "grad_norm": 0.18125870823860168, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6944, + "grad_norm": 0.1543625295162201, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6921, + "grad_norm": 0.08381187915802002, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6932, + "grad_norm": 0.2679308354854584, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6939, + "grad_norm": 0.5971918106079102, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6948, + "grad_norm": 0.4888291358947754, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6936, + "grad_norm": 0.48184147477149963, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6946, + "grad_norm": 1.0939360857009888, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6942, + "grad_norm": 1.336277723312378, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6926, + "grad_norm": 0.06913850456476212, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6912, + "grad_norm": 0.531511664390564, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6927, + "grad_norm": 0.11752745509147644, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6886, + "grad_norm": 0.3513859808444977, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6937, + "grad_norm": 0.1914355456829071, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.694, + "grad_norm": 0.20215129852294922, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.7041, + "grad_norm": 0.8248656392097473, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6946, + "grad_norm": 0.2487579882144928, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6882, + "grad_norm": 0.15016788244247437, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6891, + "grad_norm": 0.16379833221435547, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.7036, + "grad_norm": 0.8011008501052856, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6852, + "grad_norm": 0.5098150372505188, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327872, + "loss": 0.7045, + "grad_norm": 0.9001055359840393, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7017, + "grad_norm": 0.8778653144836426, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6948, + "grad_norm": 0.3711833357810974, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6936, + "grad_norm": 0.20933866500854492, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6956, + "grad_norm": 0.1666393131017685, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6937, + "grad_norm": 0.24948449432849884, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6925, + "grad_norm": 0.4152412414550781, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6884, + "grad_norm": 0.3124934136867523, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6929, + "grad_norm": 0.051713231950998306, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7015, + "grad_norm": 0.962104856967926, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6931, + "grad_norm": 0.15692028403282166, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.691, + "grad_norm": 0.2183641642332077, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6906, + "grad_norm": 0.21790091693401337, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6928, + "grad_norm": 0.17110152542591095, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6959, + "grad_norm": 0.28584593534469604, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6938, + "grad_norm": 0.16342511773109436, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6891, + "grad_norm": 0.3222731947898865, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6907, + "grad_norm": 0.11666169762611389, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6916, + "grad_norm": 0.09830675274133682, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493275648, + "loss": 0.6893, + "grad_norm": 0.20368245244026184, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.698, + "grad_norm": 0.5435935258865356, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6932, + "grad_norm": 0.10252252966165543, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6947, + "grad_norm": 0.41972804069519043, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6847, + "grad_norm": 0.554143488407135, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6775, + "grad_norm": 1.2635470628738403, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6886, + "grad_norm": 0.10575351119041443, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.701, + "grad_norm": 0.6854659914970398, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6999, + "grad_norm": 0.5649988651275635, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6886, + "grad_norm": 0.28524959087371826, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6974, + "grad_norm": 0.4405934512615204, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6908, + "grad_norm": 0.08347369730472565, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.7043, + "grad_norm": 0.9190731644630432, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6947, + "grad_norm": 0.309678852558136, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6913, + "grad_norm": 0.20287437736988068, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6918, + "grad_norm": 0.0759817436337471, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.7018, + "grad_norm": 0.9782448410987854, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6977, + "grad_norm": 0.582689106464386, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6916, + "grad_norm": 0.4059756100177765, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6964, + "grad_norm": 0.6595931053161621, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6904, + "grad_norm": 0.2270367592573166, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.691, + "grad_norm": 0.17621037364006042, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6926, + "grad_norm": 0.7058303952217102, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6935, + "grad_norm": 0.24757826328277588, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6938, + "grad_norm": 0.48802077770233154, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6931, + "grad_norm": 0.15181876718997955, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6952, + "grad_norm": 0.3617120385169983, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.69, + "grad_norm": 0.16492193937301636, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6943, + "grad_norm": 0.26307886838912964, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6941, + "grad_norm": 0.1621062308549881, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6944, + "grad_norm": 0.8452896475791931, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6935, + "grad_norm": 0.3781874477863312, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.693, + "grad_norm": 0.13579323887825012, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6939, + "grad_norm": 0.059740811586380005, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6914, + "grad_norm": 0.08294578641653061, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6922, + "grad_norm": 0.34723609685897827, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6935, + "grad_norm": 0.7457748651504517, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493274112, + "loss": 0.6899, + "grad_norm": 0.24280379712581635, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6905, + "grad_norm": 0.6816572546958923, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6942, + "grad_norm": 0.3972599506378174, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6941, + "grad_norm": 0.6357232928276062, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6961, + "grad_norm": 0.9896860122680664, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6943, + "grad_norm": 0.5112079381942749, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6924, + "grad_norm": 0.38294121623039246, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.693, + "grad_norm": 0.9832779765129089, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493272576, + "loss": 0.6931, + "grad_norm": 0.12397433817386627, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.697, + "grad_norm": 0.5781083703041077, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6929, + "grad_norm": 0.25781261920928955, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6952, + "grad_norm": 0.6172275543212891, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6929, + "grad_norm": 0.23490451276302338, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6936, + "grad_norm": 0.3750430941581726, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6919, + "grad_norm": 0.1534176617860794, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6932, + "grad_norm": 0.2276819497346878, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49327104, + "loss": 0.6924, + "grad_norm": 0.45422545075416565, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6933, + "grad_norm": 0.25877222418785095, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493260288, + "loss": 0.6907, + "grad_norm": 0.8079131841659546, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6904, + "grad_norm": 0.35475847125053406, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6926, + "grad_norm": 0.39005500078201294, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6964, + "grad_norm": 0.5171116590499878, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6896, + "grad_norm": 0.6807390451431274, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6912, + "grad_norm": 0.11151736974716187, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6952, + "grad_norm": 0.5057077407836914, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6925, + "grad_norm": 0.39858320355415344, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6965, + "grad_norm": 0.1804555207490921, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6932, + "grad_norm": 0.5016638040542603, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6925, + "grad_norm": 0.5114827156066895, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6914, + "grad_norm": 0.07644855231046677, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6941, + "grad_norm": 0.33672669529914856, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.691, + "grad_norm": 0.08696132898330688, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6933, + "grad_norm": 0.15540868043899536, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.693, + "grad_norm": 0.6164510250091553, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6922, + "grad_norm": 0.6853800415992737, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.692, + "grad_norm": 0.22956722974777222, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6925, + "grad_norm": 0.6276169419288635, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6928, + "grad_norm": 0.24217267334461212, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6937, + "grad_norm": 0.26531368494033813, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.693, + "grad_norm": 0.057315047830343246, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6927, + "grad_norm": 0.23406030237674713, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6928, + "grad_norm": 0.22061894834041595, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.49326336, + "loss": 0.6937, + "grad_norm": 0.5037204623222351, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493261824, + "loss": 0.6921, + "grad_norm": 0.455049067735672, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493275648, + "loss": 0.6916, + "grad_norm": 0.3497721552848816, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.69, + "grad_norm": 0.27360719442367554, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6942, + "grad_norm": 0.09777634590864182, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493266432, + "loss": 0.6932, + "grad_norm": 0.4687745273113251, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6918, + "grad_norm": 0.16850349307060242, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493258752, + "loss": 0.6936, + "grad_norm": 0.0936020165681839, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493269504, + "loss": 0.6926, + "grad_norm": 0.8080611228942871, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6944, + "grad_norm": 0.4949457049369812, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493267968, + "loss": 0.6938, + "grad_norm": 0.49963483214378357, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "loss": 0.6931, + "grad_norm": 0.42930105328559875, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.5002624, + "gpu_mem": 4.493264896, + "train_runtime": 1387.1284, + "train_samples_per_second": 29.511, + "train_steps_per_second": 0.461, + "total_flos": 0.0, + "train_loss": 0.7222484215162694 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a6595677543f23b42f06770761e8d2aa18b1163d --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a9e40189136bf299dc9aaae1e3f110f34689776e --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.25853242320819114 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..69fb3490d1ff44f4a021b68649a8101d3dd5fa66 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 789096 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-arc_c-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2", + "seed": 42, + "timestamp": "2025-09-02T08:48:46.763834" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..c68cf0cde168dec74ca19c24018cde9614354d56 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r2-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.696833536, + "gpu_mem": 4.421125632, + "loss": 4.4614, + "grad_norm": 9.372712135314941, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.697030144, + "gpu_mem": 4.42758144, + "loss": 4.6994, + "grad_norm": 9.505729675292969, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.697030144, + "gpu_mem": 4.42761216, + "loss": 4.2879, + "grad_norm": 9.940461158752441, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427578368, + "loss": 3.9191, + "grad_norm": 9.804134368896484, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.42756608, + "loss": 3.6181, + "grad_norm": 9.346920013427734, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427629056, + "loss": 3.4964, + "grad_norm": 7.807424068450928, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.697226752, + "gpu_mem": 4.4276352, + "loss": 2.9965, + "grad_norm": 6.8249430656433105, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427593728, + "loss": 2.7416, + "grad_norm": 5.749145984649658, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.697226752, + "gpu_mem": 4.42758912, + "loss": 2.4866, + "grad_norm": 4.574408531188965, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427578368, + "loss": 2.1443, + "grad_norm": 3.5885202884674072, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.697226752, + "gpu_mem": 4.42758912, + "loss": 1.8385, + "grad_norm": 2.463916063308716, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427613696, + "loss": 1.7234, + "grad_norm": 1.6683762073516846, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427613696, + "loss": 1.8995, + "grad_norm": 1.8558967113494873, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427561472, + "loss": 1.6942, + "grad_norm": 0.9398776888847351, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427636736, + "loss": 1.6018, + "grad_norm": 0.6968910694122314, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427630592, + "loss": 1.6606, + "grad_norm": 0.7243131995201111, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.4276352, + "loss": 1.4995, + "grad_norm": 0.5723839402198792, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.697226752, + "gpu_mem": 4.43082496, + "loss": 2.384, + "grad_norm": 1.6258989572525024, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430823424, + "loss": 1.591, + "grad_norm": 0.8138455748558044, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430798848, + "loss": 1.4479, + "grad_norm": 0.6239368915557861, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430806528, + "loss": 1.4492, + "grad_norm": 0.3986297845840454, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430835712, + "loss": 1.602, + "grad_norm": 1.0776423215866089, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430864896, + "loss": 1.3933, + "grad_norm": 0.35843774676322937, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430808064, + "loss": 1.4628, + "grad_norm": 0.39809557795524597, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430877184, + "loss": 1.3473, + "grad_norm": 0.3980904221534729, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430834176, + "loss": 1.4136, + "grad_norm": 0.4054908752441406, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430792704, + "loss": 1.4158, + "grad_norm": 0.38542529940605164, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430838784, + "loss": 1.5844, + "grad_norm": 0.9159950017929077, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430834176, + "loss": 1.4166, + "grad_norm": 0.32921159267425537, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430823424, + "loss": 1.4374, + "grad_norm": 0.4871726334095001, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430854144, + "loss": 1.4105, + "grad_norm": 0.41168299317359924, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.697226752, + "gpu_mem": 4.43086336, + "loss": 1.4218, + "grad_norm": 0.32501131296157837, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430843392, + "loss": 1.4401, + "grad_norm": 0.33109134435653687, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430821888, + "loss": 1.421, + "grad_norm": 0.3925471007823944, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.697226752, + "gpu_mem": 4.43070976, + "loss": 2.1134, + "grad_norm": 0.3770363926887512, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427607552, + "loss": 1.3952, + "grad_norm": 0.4153933525085449, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427616768, + "loss": 1.4243, + "grad_norm": 0.34528809785842896, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427587584, + "loss": 1.377, + "grad_norm": 0.3698920011520386, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427606016, + "loss": 1.3633, + "grad_norm": 0.2796523869037628, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427582976, + "loss": 1.4254, + "grad_norm": 0.32899579405784607, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427584512, + "loss": 1.4214, + "grad_norm": 0.29549214243888855, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427613696, + "loss": 1.3469, + "grad_norm": 0.337439626455307, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427629056, + "loss": 1.4095, + "grad_norm": 0.5586375594139099, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427647488, + "loss": 1.3974, + "grad_norm": 0.30426549911499023, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427601408, + "loss": 1.3612, + "grad_norm": 0.21573281288146973, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427595264, + "loss": 1.3481, + "grad_norm": 0.47057032585144043, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.697226752, + "gpu_mem": 4.42758912, + "loss": 1.3869, + "grad_norm": 0.45078304409980774, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427593728, + "loss": 1.3336, + "grad_norm": 0.3017481863498688, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427584512, + "loss": 1.3889, + "grad_norm": 0.42076191306114197, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.42756608, + "loss": 1.3624, + "grad_norm": 0.2517501711845398, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427590656, + "loss": 1.3882, + "grad_norm": 0.38685446977615356, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.697226752, + "gpu_mem": 4.427618304, + "loss": 1.3943, + "grad_norm": 0.34950628876686096, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430820352, + "loss": 2.0359, + "grad_norm": 0.5848061442375183, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430789632, + "loss": 1.3851, + "grad_norm": 0.24298778176307678, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430823424, + "loss": 1.4022, + "grad_norm": 0.38060420751571655, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430897152, + "loss": 1.3846, + "grad_norm": 0.31405743956565857, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.43084032, + "loss": 1.3708, + "grad_norm": 0.1922149956226349, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430834176, + "loss": 1.3258, + "grad_norm": 0.4310964345932007, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430884864, + "loss": 1.3606, + "grad_norm": 0.37557414174079895, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430811136, + "loss": 1.4284, + "grad_norm": 0.3953492045402527, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.697226752, + "gpu_mem": 4.43082496, + "loss": 1.4005, + "grad_norm": 0.3415544629096985, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430826496, + "loss": 1.3717, + "grad_norm": 0.3572997748851776, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430815744, + "loss": 1.3627, + "grad_norm": 0.2714832127094269, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.43083264, + "loss": 1.3488, + "grad_norm": 0.3246787488460541, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430854144, + "loss": 1.3766, + "grad_norm": 0.41255703568458557, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430844928, + "loss": 1.3593, + "grad_norm": 0.4100288450717926, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.697226752, + "gpu_mem": 4.43087104, + "loss": 1.3571, + "grad_norm": 0.30215904116630554, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430821888, + "loss": 1.3177, + "grad_norm": 0.3593924641609192, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.697226752, + "gpu_mem": 4.430821888, + "train_runtime": 372.1856, + "train_samples_per_second": 12.026, + "train_steps_per_second": 0.183, + "total_flos": 0.0, + "train_loss": 1.774010255056269 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ded8039b496858a8aa3d756f427279337f8964 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..acf6e4d59deb95eba77fd56b93cf338490c85144 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.28924914675767915 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..6df8775de8a57afa1711925ffd2dd4a2dc902962 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3163776 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-arc_c-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2", + "seed": 42, + "timestamp": "2025-09-02T15:43:09.582091" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..e253552b0322e689f37e6df91ed98314ba6b901f --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_c-r8-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.697902592, + "gpu_mem": 4.430771712, + "loss": 4.4614, + "grad_norm": 19.400867462158203, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.6980992, + "gpu_mem": 4.45615104, + "loss": 4.6994, + "grad_norm": 19.758705139160156, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.698295808, + "gpu_mem": 4.45618176, + "loss": 3.9774, + "grad_norm": 20.124223709106445, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456147968, + "loss": 3.0822, + "grad_norm": 15.91993522644043, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.698295808, + "gpu_mem": 4.45613568, + "loss": 2.3806, + "grad_norm": 10.648056983947754, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456198656, + "loss": 2.181, + "grad_norm": 8.884888648986816, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.698295808, + "gpu_mem": 4.4562048, + "loss": 1.714, + "grad_norm": 2.4684672355651855, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456163328, + "loss": 1.608, + "grad_norm": 1.6026487350463867, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.698295808, + "gpu_mem": 4.45615872, + "loss": 1.6429, + "grad_norm": 2.1883907318115234, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456147968, + "loss": 1.4279, + "grad_norm": 0.8725115656852722, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.698295808, + "gpu_mem": 4.45615872, + "loss": 1.3752, + "grad_norm": 0.5333192348480225, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456183296, + "loss": 1.4199, + "grad_norm": 1.0847747325897217, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456183296, + "loss": 1.4196, + "grad_norm": 1.9492383003234863, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456131072, + "loss": 1.4268, + "grad_norm": 1.0490226745605469, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456206336, + "loss": 1.4702, + "grad_norm": 1.7151758670806885, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456200192, + "loss": 1.396, + "grad_norm": 0.5570492744445801, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.698295808, + "gpu_mem": 4.4562048, + "loss": 1.4186, + "grad_norm": 0.9546794891357422, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.698295808, + "gpu_mem": 4.46885632, + "loss": 1.9975, + "grad_norm": 1.0332984924316406, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468854784, + "loss": 1.3462, + "grad_norm": 0.5303434133529663, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468830208, + "loss": 1.3765, + "grad_norm": 0.9883305430412292, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468837888, + "loss": 1.3492, + "grad_norm": 0.5697071552276611, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468867072, + "loss": 1.3572, + "grad_norm": 0.829210102558136, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468896256, + "loss": 1.3047, + "grad_norm": 0.43821457028388977, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468839424, + "loss": 1.3898, + "grad_norm": 0.8927768468856812, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468908544, + "loss": 1.2697, + "grad_norm": 0.45413684844970703, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468865536, + "loss": 1.3127, + "grad_norm": 0.40348803997039795, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468824064, + "loss": 1.3473, + "grad_norm": 0.5828485488891602, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468870144, + "loss": 1.4587, + "grad_norm": 1.5144755840301514, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468865536, + "loss": 1.3599, + "grad_norm": 0.3945765793323517, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468854784, + "loss": 1.3584, + "grad_norm": 0.5291534066200256, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468885504, + "loss": 1.3545, + "grad_norm": 0.46884438395500183, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.698295808, + "gpu_mem": 4.46889472, + "loss": 1.3678, + "grad_norm": 0.5220944285392761, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468874752, + "loss": 1.3935, + "grad_norm": 0.5094056725502014, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.698295808, + "gpu_mem": 4.468853248, + "loss": 1.3778, + "grad_norm": 0.4601474702358246, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.698295808, + "gpu_mem": 4.46874112, + "loss": 2.1371, + "grad_norm": 1.1062135696411133, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456177152, + "loss": 1.4373, + "grad_norm": 1.052232027053833, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456186368, + "loss": 1.3538, + "grad_norm": 0.3943697512149811, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456157184, + "loss": 1.3616, + "grad_norm": 0.5321637392044067, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456175616, + "loss": 1.346, + "grad_norm": 0.3658493161201477, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456152576, + "loss": 1.382, + "grad_norm": 0.4273500144481659, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456154112, + "loss": 1.3834, + "grad_norm": 0.3950561285018921, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456183296, + "loss": 1.3171, + "grad_norm": 0.5361433625221252, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.698295808, + "gpu_mem": 4.456198656, + "loss": 1.3459, + "grad_norm": 0.5288717150688171, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.698492416, + "gpu_mem": 4.456217088, + "loss": 1.3412, + "grad_norm": 0.2969790995121002, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.698492416, + "gpu_mem": 4.456171008, + "loss": 1.3258, + "grad_norm": 0.3398008942604065, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.698492416, + "gpu_mem": 4.456164864, + "loss": 1.2939, + "grad_norm": 0.468073308467865, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.698492416, + "gpu_mem": 4.45615872, + "loss": 1.3066, + "grad_norm": 0.460665762424469, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.698492416, + "gpu_mem": 4.456163328, + "loss": 1.2871, + "grad_norm": 0.378970742225647, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.698492416, + "gpu_mem": 4.456154112, + "loss": 1.308, + "grad_norm": 0.46546679735183716, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.698492416, + "gpu_mem": 4.45613568, + "loss": 1.3269, + "grad_norm": 0.3771406412124634, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.698492416, + "gpu_mem": 4.456160256, + "loss": 1.3664, + "grad_norm": 0.683644711971283, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.698492416, + "gpu_mem": 4.456187904, + "loss": 1.3598, + "grad_norm": 0.36342349648475647, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468851712, + "loss": 1.9665, + "grad_norm": 0.6932979226112366, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468820992, + "loss": 1.3753, + "grad_norm": 0.6314702033996582, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468854784, + "loss": 1.3816, + "grad_norm": 0.7213732004165649, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468928512, + "loss": 1.3321, + "grad_norm": 0.42442506551742554, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.698492416, + "gpu_mem": 4.46887168, + "loss": 1.3354, + "grad_norm": 0.4221729338169098, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468865536, + "loss": 1.2422, + "grad_norm": 0.44664695858955383, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468916224, + "loss": 1.3287, + "grad_norm": 0.5179294943809509, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468842496, + "loss": 1.3857, + "grad_norm": 0.7303962707519531, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.698492416, + "gpu_mem": 4.46885632, + "loss": 1.3938, + "grad_norm": 0.6757739186286926, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468857856, + "loss": 1.3305, + "grad_norm": 0.41094067692756653, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468847104, + "loss": 1.3294, + "grad_norm": 0.39702531695365906, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468864, + "loss": 1.3089, + "grad_norm": 0.46185895800590515, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468885504, + "loss": 1.33, + "grad_norm": 0.4505058526992798, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468876288, + "loss": 1.2903, + "grad_norm": 0.43471312522888184, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.698492416, + "gpu_mem": 4.4689024, + "loss": 1.3186, + "grad_norm": 0.3572932481765747, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468853248, + "loss": 1.3038, + "grad_norm": 0.5262019038200378, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.698492416, + "gpu_mem": 4.468853248, + "train_runtime": 371.6933, + "train_samples_per_second": 12.042, + "train_steps_per_second": 0.183, + "total_flos": 0.0, + "train_loss": 1.5846292060964249 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d69c6bc9ef572e681044e096143c4cad32a3229 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..754537073241b21a71884d7f2908612c8ac49084 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.4532828282828283 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..9ba477f28a5038dd4a9905505223319d6b5c457c --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12773376 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-arc_e-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2", + "seed": 42, + "timestamp": "2025-09-02T22:00:30.632087" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..014964b28f93be2f199924b35f71ef7f93e73ea1 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r32-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.722679296, + "gpu_mem": 4.469852672, + "loss": 4.6319, + "grad_norm": 36.45879364013672, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.722875904, + "gpu_mem": 4.572100608, + "loss": 4.4578, + "grad_norm": 36.681556701660156, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.722875904, + "gpu_mem": 4.572079104, + "loss": 3.8268, + "grad_norm": 32.93587112426758, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.723072512, + "gpu_mem": 4.5720576, + "loss": 3.0484, + "grad_norm": 40.40604782104492, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.72326912, + "gpu_mem": 4.572099072, + "loss": 2.1038, + "grad_norm": 12.745757102966309, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.72326912, + "gpu_mem": 4.572074496, + "loss": 1.6536, + "grad_norm": 4.352513313293457, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.72326912, + "gpu_mem": 4.572097536, + "loss": 1.565, + "grad_norm": 2.8345093727111816, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.72326912, + "gpu_mem": 4.572056064, + "loss": 1.447, + "grad_norm": 2.121713399887085, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.72326912, + "gpu_mem": 4.5720576, + "loss": 1.4029, + "grad_norm": 2.9546499252319336, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572052992, + "loss": 1.5041, + "grad_norm": 4.273380756378174, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572131328, + "loss": 1.3784, + "grad_norm": 1.9875341653823853, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572105216, + "loss": 1.3467, + "grad_norm": 4.471842288970947, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572056064, + "loss": 1.3736, + "grad_norm": 1.8003240823745728, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572077568, + "loss": 1.3333, + "grad_norm": 1.6007957458496094, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572054528, + "loss": 1.3545, + "grad_norm": 1.2756412029266357, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572059136, + "loss": 1.3601, + "grad_norm": 0.8909265398979187, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572096, + "loss": 1.3048, + "grad_norm": 0.8183813691139221, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572106752, + "loss": 1.3967, + "grad_norm": 2.0747129917144775, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57204992, + "loss": 1.4493, + "grad_norm": 2.548840045928955, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572120576, + "loss": 1.4099, + "grad_norm": 1.5279086828231812, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57211904, + "loss": 1.3201, + "grad_norm": 0.9961609244346619, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572076032, + "loss": 1.3579, + "grad_norm": 1.1940791606903076, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572092928, + "loss": 1.3296, + "grad_norm": 0.7753334641456604, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57204992, + "loss": 1.346, + "grad_norm": 0.7554585337638855, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572079104, + "loss": 1.3713, + "grad_norm": 0.5672120451927185, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572059136, + "loss": 1.4402, + "grad_norm": 0.6943309307098389, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572085248, + "loss": 1.3735, + "grad_norm": 0.7753919363021851, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572085248, + "loss": 1.3615, + "grad_norm": 0.40731504559516907, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572063744, + "loss": 1.2595, + "grad_norm": 0.535586953163147, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572054528, + "loss": 1.3465, + "grad_norm": 0.56028151512146, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57207296, + "loss": 1.3596, + "grad_norm": 0.4320484697818756, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572096, + "loss": 1.3415, + "grad_norm": 0.7146104574203491, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572092928, + "loss": 1.3688, + "grad_norm": 0.6620209217071533, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572096, + "loss": 1.4361, + "grad_norm": 0.9934812784194946, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572077568, + "loss": 1.3076, + "grad_norm": 0.5487916469573975, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623161856, + "loss": 1.9896, + "grad_norm": 1.0605766773223877, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623166464, + "loss": 1.3325, + "grad_norm": 0.2673764228820801, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62314496, + "loss": 1.2375, + "grad_norm": 0.448163241147995, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623134208, + "loss": 1.3539, + "grad_norm": 0.8144217133522034, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623197184, + "loss": 1.4096, + "grad_norm": 0.8239535093307495, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623157248, + "loss": 1.318, + "grad_norm": 0.3538970947265625, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623200256, + "loss": 1.3438, + "grad_norm": 0.32310113310813904, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623149568, + "loss": 1.3927, + "grad_norm": 0.49185827374458313, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62321408, + "loss": 1.3737, + "grad_norm": 0.6502453684806824, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623181824, + "loss": 1.3643, + "grad_norm": 0.4915526211261749, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623186432, + "loss": 1.3459, + "grad_norm": 0.28328266739845276, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623132672, + "loss": 1.3153, + "grad_norm": 0.6817641258239746, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623146496, + "loss": 1.3175, + "grad_norm": 0.4045603573322296, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623135744, + "loss": 1.3104, + "grad_norm": 0.47856390476226807, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623149568, + "loss": 1.3143, + "grad_norm": 0.4425677955150604, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623201792, + "loss": 1.3574, + "grad_norm": 0.5529178380966187, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623149568, + "loss": 1.3535, + "grad_norm": 0.6092351078987122, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623218688, + "loss": 1.299, + "grad_norm": 0.4857425391674042, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623186432, + "loss": 1.274, + "grad_norm": 0.513917863368988, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623195648, + "loss": 1.3672, + "grad_norm": 0.64312744140625, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623171072, + "loss": 1.3145, + "grad_norm": 0.5384689569473267, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623204864, + "loss": 1.2936, + "grad_norm": 0.28170645236968994, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623186432, + "loss": 1.3219, + "grad_norm": 0.4637914299964905, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623172608, + "loss": 1.305, + "grad_norm": 0.3103134334087372, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623211008, + "loss": 1.3079, + "grad_norm": 0.7782206535339355, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623143424, + "loss": 1.2849, + "grad_norm": 0.6881386637687683, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62319104, + "loss": 1.3518, + "grad_norm": 0.4648824632167816, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623140352, + "loss": 1.3097, + "grad_norm": 0.626806378364563, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623189504, + "loss": 1.3291, + "grad_norm": 0.710722029209137, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623187968, + "loss": 1.3278, + "grad_norm": 0.5693137049674988, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.723465728, + "gpu_mem": 4.6232064, + "loss": 1.2654, + "grad_norm": 0.5933160185813904, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623148032, + "loss": 1.3131, + "grad_norm": 0.5623018145561218, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62316032, + "loss": 1.3476, + "grad_norm": 0.5520971417427063, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623184896, + "loss": 1.3214, + "grad_norm": 0.8189231157302856, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623161856, + "loss": 1.2638, + "grad_norm": 0.5372128486633301, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.723465728, + "gpu_mem": 4.622997504, + "loss": 2.0036, + "grad_norm": 1.5936216115951538, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57208832, + "loss": 1.3225, + "grad_norm": 0.6790529489517212, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572051456, + "loss": 1.2909, + "grad_norm": 0.5629863739013672, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57211136, + "loss": 1.2696, + "grad_norm": 0.5548259615898132, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572079104, + "loss": 1.3284, + "grad_norm": 0.7793273329734802, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572089856, + "loss": 1.272, + "grad_norm": 0.5340213179588318, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57212672, + "loss": 1.3101, + "grad_norm": 0.47268882393836975, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57211136, + "loss": 1.3425, + "grad_norm": 0.6505216360092163, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572062208, + "loss": 1.2464, + "grad_norm": 0.8644428849220276, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572106752, + "loss": 1.3184, + "grad_norm": 0.6468827128410339, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572092928, + "loss": 1.3054, + "grad_norm": 0.8182364702224731, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572060672, + "loss": 1.2958, + "grad_norm": 0.897240936756134, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57211136, + "loss": 1.3086, + "grad_norm": 1.11750328540802, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57204992, + "loss": 1.3598, + "grad_norm": 1.0985162258148193, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572096, + "loss": 1.3771, + "grad_norm": 1.3943769931793213, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57204992, + "loss": 1.3093, + "grad_norm": 0.7010465264320374, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57208064, + "loss": 1.3349, + "grad_norm": 0.8608537316322327, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572056064, + "loss": 1.2987, + "grad_norm": 0.9607675075531006, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572109824, + "loss": 1.2497, + "grad_norm": 0.9415262341499329, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572091392, + "loss": 1.2455, + "grad_norm": 0.8596855998039246, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572040704, + "loss": 1.3462, + "grad_norm": 1.6377224922180176, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.723465728, + "gpu_mem": 4.57206528, + "loss": 1.2477, + "grad_norm": 0.6721343994140625, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572068352, + "loss": 1.2622, + "grad_norm": 1.6813093423843384, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572060672, + "loss": 1.2178, + "grad_norm": 1.2276933193206787, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572099072, + "loss": 1.2426, + "grad_norm": 0.7991220355033875, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572108288, + "loss": 1.1574, + "grad_norm": 1.24543297290802, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572051456, + "loss": 1.278, + "grad_norm": 1.3419033288955688, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572051456, + "loss": 1.2878, + "grad_norm": 1.4254034757614136, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572048384, + "loss": 1.2312, + "grad_norm": 1.054552674293518, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572046848, + "loss": 1.2256, + "grad_norm": 1.3664337396621704, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572089856, + "loss": 1.1881, + "grad_norm": 1.050628423690796, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572028416, + "loss": 1.2544, + "grad_norm": 1.049895167350769, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572077568, + "loss": 1.2479, + "grad_norm": 1.2685251235961914, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572140544, + "loss": 1.338, + "grad_norm": 1.7948390245437622, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572092928, + "loss": 1.1778, + "grad_norm": 0.8694394826889038, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.723465728, + "gpu_mem": 4.572074496, + "loss": 1.2814, + "grad_norm": 1.238435983657837, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623184896, + "loss": 1.6933, + "grad_norm": 1.8174742460250854, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623166464, + "loss": 1.213, + "grad_norm": 0.8821410536766052, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623155712, + "loss": 1.1742, + "grad_norm": 1.0146138668060303, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623209472, + "loss": 1.1932, + "grad_norm": 1.5023205280303955, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623169536, + "loss": 1.2135, + "grad_norm": 1.2969108819961548, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623187968, + "loss": 1.1908, + "grad_norm": 0.9395973682403564, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623250944, + "loss": 1.1625, + "grad_norm": 0.9785551428794861, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623178752, + "loss": 1.2298, + "grad_norm": 1.288878321647644, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623172608, + "loss": 1.2344, + "grad_norm": 1.1662344932556152, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623187968, + "loss": 1.1688, + "grad_norm": 1.1845673322677612, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623203328, + "loss": 1.1269, + "grad_norm": 1.0607956647872925, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623194112, + "loss": 1.1499, + "grad_norm": 1.200290322303772, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623184896, + "loss": 1.1745, + "grad_norm": 1.1815811395645142, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623203328, + "loss": 1.1513, + "grad_norm": 1.055599331855774, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623201792, + "loss": 1.1195, + "grad_norm": 0.9936859607696533, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623158784, + "loss": 1.0325, + "grad_norm": 1.160421371459961, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62319104, + "loss": 1.1149, + "grad_norm": 1.0793308019638062, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62314496, + "loss": 1.195, + "grad_norm": 1.2750935554504395, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623189504, + "loss": 1.1325, + "grad_norm": 1.6646710634231567, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623140352, + "loss": 1.1179, + "grad_norm": 1.3640121221542358, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62315264, + "loss": 1.1553, + "grad_norm": 1.7339750528335571, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623177216, + "loss": 1.207, + "grad_norm": 1.6928668022155762, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623138816, + "loss": 1.1152, + "grad_norm": 2.0333118438720703, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623141888, + "loss": 1.1615, + "grad_norm": 1.8231158256530762, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623154176, + "loss": 1.0961, + "grad_norm": 1.1981271505355835, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623118848, + "loss": 1.1015, + "grad_norm": 1.1937692165374756, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62316032, + "loss": 1.1162, + "grad_norm": 1.3339706659317017, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.723465728, + "gpu_mem": 4.62317568, + "loss": 1.1475, + "grad_norm": 1.419571876525879, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623140352, + "loss": 1.1742, + "grad_norm": 1.4531906843185425, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623148032, + "loss": 1.1818, + "grad_norm": 1.6564372777938843, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623169536, + "loss": 1.1225, + "grad_norm": 1.5165504217147827, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623180288, + "loss": 1.1821, + "grad_norm": 1.2003953456878662, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.723465728, + "gpu_mem": 4.623172608, + "loss": 1.2374, + "grad_norm": 1.741915225982666, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.723465728, + "gpu_mem": 4.6232064, + "loss": 1.1697, + "grad_norm": 1.9262864589691162, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.723465728, + "gpu_mem": 4.6232064, + "train_runtime": 672.8629, + "train_samples_per_second": 13.382, + "train_steps_per_second": 0.208, + "total_flos": 0.0, + "train_loss": 1.3816127955913544 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ded8039b496858a8aa3d756f427279337f8964 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..dfdd43982c0e5e16e96b4f577ec1105971a90b40 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.2895622895622896 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..9c0629b24d9e6b33ef0392c59d0833f4ad1df927 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3163776 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-arc_e-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2", + "seed": 42, + "timestamp": "2025-09-02T15:05:27.652816" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..b9e22770eec0427456120b2e1b7d0adc4b3dc3a0 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-arc_e-r8-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.72177408, + "gpu_mem": 4.430716416, + "loss": 4.6319, + "grad_norm": 19.96603012084961, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.72177408, + "gpu_mem": 4.4561664, + "loss": 4.4578, + "grad_norm": 19.951278686523438, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.721970688, + "gpu_mem": 4.456144896, + "loss": 4.4422, + "grad_norm": 20.226734161376953, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.721970688, + "gpu_mem": 4.456123392, + "loss": 4.2415, + "grad_norm": 18.856990814208984, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.722167296, + "gpu_mem": 4.456164864, + "loss": 3.5153, + "grad_norm": 15.925736427307129, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.722167296, + "gpu_mem": 4.456140288, + "loss": 2.7608, + "grad_norm": 12.457343101501465, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.722363904, + "gpu_mem": 4.456163328, + "loss": 2.4775, + "grad_norm": 14.461479187011719, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.722363904, + "gpu_mem": 4.456121856, + "loss": 2.0511, + "grad_norm": 6.818901538848877, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456123392, + "loss": 2.0216, + "grad_norm": 4.6875319480896, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456118784, + "loss": 1.7575, + "grad_norm": 2.3636090755462646, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45619712, + "loss": 1.6068, + "grad_norm": 2.060075521469116, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456171008, + "loss": 1.5982, + "grad_norm": 1.9440051317214966, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456121856, + "loss": 1.4191, + "grad_norm": 1.0107730627059937, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45614336, + "loss": 1.3963, + "grad_norm": 1.0020443201065063, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45612032, + "loss": 1.3624, + "grad_norm": 0.9032099843025208, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456124928, + "loss": 1.3693, + "grad_norm": 0.5151799917221069, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456161792, + "loss": 1.343, + "grad_norm": 1.0978657007217407, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456172544, + "loss": 1.3397, + "grad_norm": 1.1621737480163574, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456115712, + "loss": 1.339, + "grad_norm": 2.7990646362304688, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456186368, + "loss": 1.3995, + "grad_norm": 2.6295888423919678, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456184832, + "loss": 1.3245, + "grad_norm": 0.9985354542732239, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456141824, + "loss": 1.3074, + "grad_norm": 1.3775261640548706, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45615872, + "loss": 1.2883, + "grad_norm": 0.5953607559204102, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456115712, + "loss": 1.3544, + "grad_norm": 0.8177909851074219, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456144896, + "loss": 1.3615, + "grad_norm": 0.5633518099784851, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456124928, + "loss": 1.4044, + "grad_norm": 0.5022664070129395, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45615104, + "loss": 1.355, + "grad_norm": 0.5943797826766968, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45615104, + "loss": 1.3386, + "grad_norm": 0.5705973505973816, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456129536, + "loss": 1.2833, + "grad_norm": 0.7330148816108704, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45612032, + "loss": 1.3772, + "grad_norm": 0.588225245475769, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456138752, + "loss": 1.359, + "grad_norm": 0.5594052672386169, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456161792, + "loss": 1.338, + "grad_norm": 0.5966840386390686, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45615872, + "loss": 1.3514, + "grad_norm": 0.774011492729187, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456161792, + "loss": 1.4543, + "grad_norm": 1.3361443281173706, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45614336, + "loss": 1.3216, + "grad_norm": 0.7013792991638184, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468828672, + "loss": 1.9407, + "grad_norm": 0.9638655781745911, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46883328, + "loss": 1.342, + "grad_norm": 0.4204472601413727, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468811776, + "loss": 1.2265, + "grad_norm": 0.38990548253059387, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468801024, + "loss": 1.3701, + "grad_norm": 0.9990638494491577, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468864, + "loss": 1.4108, + "grad_norm": 0.8932640552520752, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468824064, + "loss": 1.3311, + "grad_norm": 0.3582281470298767, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468867072, + "loss": 1.3464, + "grad_norm": 0.3767402768135071, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468816384, + "loss": 1.3904, + "grad_norm": 0.3280416429042816, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468880896, + "loss": 1.3786, + "grad_norm": 0.5452196002006531, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46884864, + "loss": 1.375, + "grad_norm": 0.4560747444629669, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468853248, + "loss": 1.3519, + "grad_norm": 0.30323293805122375, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468799488, + "loss": 1.2942, + "grad_norm": 0.4669688045978546, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468813312, + "loss": 1.3161, + "grad_norm": 0.27313098311424255, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46880256, + "loss": 1.3249, + "grad_norm": 0.3969173729419708, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468816384, + "loss": 1.3403, + "grad_norm": 0.429466187953949, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468868608, + "loss": 1.3553, + "grad_norm": 0.45321452617645264, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468816384, + "loss": 1.3593, + "grad_norm": 0.43704646825790405, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468885504, + "loss": 1.3012, + "grad_norm": 0.39156848192214966, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468853248, + "loss": 1.296, + "grad_norm": 0.36113888025283813, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468862464, + "loss": 1.3626, + "grad_norm": 0.3690665066242218, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468837888, + "loss": 1.3101, + "grad_norm": 0.2917788624763489, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46887168, + "loss": 1.3122, + "grad_norm": 0.18467584252357483, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468853248, + "loss": 1.3149, + "grad_norm": 0.2777339518070221, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468839424, + "loss": 1.3159, + "grad_norm": 0.2292451113462448, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468877824, + "loss": 1.2997, + "grad_norm": 0.4204176962375641, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46881024, + "loss": 1.3002, + "grad_norm": 0.39707139134407043, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468857856, + "loss": 1.3611, + "grad_norm": 0.2869921326637268, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468807168, + "loss": 1.3246, + "grad_norm": 0.37791329622268677, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46885632, + "loss": 1.3409, + "grad_norm": 0.4477396011352539, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468854784, + "loss": 1.34, + "grad_norm": 0.34427109360694885, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468873216, + "loss": 1.3138, + "grad_norm": 0.36941012740135193, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468814848, + "loss": 1.3096, + "grad_norm": 0.2957288324832916, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468827136, + "loss": 1.3422, + "grad_norm": 0.2089984267950058, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468851712, + "loss": 1.2997, + "grad_norm": 0.4101139307022095, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468828672, + "loss": 1.275, + "grad_norm": 0.27523690462112427, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46866432, + "loss": 2.0167, + "grad_norm": 1.3589415550231934, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456154112, + "loss": 1.3384, + "grad_norm": 0.2910694479942322, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456117248, + "loss": 1.3054, + "grad_norm": 0.2803052067756653, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456177152, + "loss": 1.3097, + "grad_norm": 0.28145790100097656, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456144896, + "loss": 1.3439, + "grad_norm": 0.46064862608909607, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456155648, + "loss": 1.2952, + "grad_norm": 0.2799583971500397, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456192512, + "loss": 1.3466, + "grad_norm": 0.224347323179245, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456177152, + "loss": 1.3565, + "grad_norm": 0.42305493354797363, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456128, + "loss": 1.27, + "grad_norm": 0.3464326560497284, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456172544, + "loss": 1.3518, + "grad_norm": 0.4349273145198822, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45615872, + "loss": 1.3339, + "grad_norm": 0.41595470905303955, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456126464, + "loss": 1.3439, + "grad_norm": 0.43620944023132324, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456177152, + "loss": 1.3321, + "grad_norm": 0.8163869380950928, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456115712, + "loss": 1.3542, + "grad_norm": 0.3859580159187317, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456161792, + "loss": 1.3613, + "grad_norm": 0.6109170913696289, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456115712, + "loss": 1.3307, + "grad_norm": 0.393836110830307, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456146432, + "loss": 1.3747, + "grad_norm": 0.34954118728637695, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456121856, + "loss": 1.3396, + "grad_norm": 0.6215751767158508, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456175616, + "loss": 1.2929, + "grad_norm": 0.3224996030330658, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456157184, + "loss": 1.3236, + "grad_norm": 0.39764606952667236, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456106496, + "loss": 1.3915, + "grad_norm": 0.8185414671897888, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456131072, + "loss": 1.3081, + "grad_norm": 0.3042651414871216, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456134144, + "loss": 1.2578, + "grad_norm": 0.5396004319190979, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456126464, + "loss": 1.2939, + "grad_norm": 0.3204064965248108, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456164864, + "loss": 1.3191, + "grad_norm": 0.27339568734169006, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45617408, + "loss": 1.262, + "grad_norm": 0.6593722701072693, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456117248, + "loss": 1.339, + "grad_norm": 0.4599621891975403, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456117248, + "loss": 1.3332, + "grad_norm": 0.23627932369709015, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456114176, + "loss": 1.2902, + "grad_norm": 0.38717377185821533, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45611264, + "loss": 1.2648, + "grad_norm": 0.33426687121391296, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456155648, + "loss": 1.2471, + "grad_norm": 0.36515259742736816, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456094208, + "loss": 1.2851, + "grad_norm": 0.24860692024230957, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45614336, + "loss": 1.2964, + "grad_norm": 0.27683886885643005, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456206336, + "loss": 1.3179, + "grad_norm": 0.468164324760437, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.722560512, + "gpu_mem": 4.45615872, + "loss": 1.2607, + "grad_norm": 0.30810731649398804, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.722560512, + "gpu_mem": 4.456140288, + "loss": 1.3101, + "grad_norm": 0.24091149866580963, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468851712, + "loss": 1.8759, + "grad_norm": 0.8109790682792664, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46883328, + "loss": 1.3087, + "grad_norm": 0.40748584270477295, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468822528, + "loss": 1.32, + "grad_norm": 0.33645784854888916, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468876288, + "loss": 1.322, + "grad_norm": 0.7711585164070129, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468836352, + "loss": 1.316, + "grad_norm": 0.46212878823280334, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468854784, + "loss": 1.2983, + "grad_norm": 0.2690226435661316, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46891776, + "loss": 1.2961, + "grad_norm": 0.2687317430973053, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468845568, + "loss": 1.3235, + "grad_norm": 0.3948516249656677, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468839424, + "loss": 1.3319, + "grad_norm": 0.4561668336391449, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468854784, + "loss": 1.3061, + "grad_norm": 0.31403616070747375, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468870144, + "loss": 1.2695, + "grad_norm": 0.3783471882343292, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468860928, + "loss": 1.2991, + "grad_norm": 0.39244377613067627, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468851712, + "loss": 1.3298, + "grad_norm": 0.3364337384700775, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468870144, + "loss": 1.3064, + "grad_norm": 0.3554164171218872, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468868608, + "loss": 1.2865, + "grad_norm": 0.39552226662635803, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.722560512, + "gpu_mem": 4.4688256, + "loss": 1.292, + "grad_norm": 0.4067149758338928, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468857856, + "loss": 1.2877, + "grad_norm": 0.4058687090873718, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468811776, + "loss": 1.3094, + "grad_norm": 0.29216790199279785, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.722560512, + "gpu_mem": 4.46885632, + "loss": 1.2951, + "grad_norm": 0.605035662651062, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468807168, + "loss": 1.2821, + "grad_norm": 0.4988017678260803, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468819456, + "loss": 1.3059, + "grad_norm": 0.2613072693347931, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468844032, + "loss": 1.2899, + "grad_norm": 0.38447514176368713, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468805632, + "loss": 1.2958, + "grad_norm": 0.24264349043369293, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468808704, + "loss": 1.3099, + "grad_norm": 0.26997119188308716, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468820992, + "loss": 1.2863, + "grad_norm": 0.32465463876724243, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468785664, + "loss": 1.2842, + "grad_norm": 0.29910916090011597, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468827136, + "loss": 1.2494, + "grad_norm": 0.36570098996162415, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468842496, + "loss": 1.2564, + "grad_norm": 0.4065055549144745, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468807168, + "loss": 1.3175, + "grad_norm": 0.5404503345489502, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468814848, + "loss": 1.337, + "grad_norm": 0.35060980916023254, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468836352, + "loss": 1.2939, + "grad_norm": 0.462889701128006, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468847104, + "loss": 1.3436, + "grad_norm": 0.3507240414619446, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468839424, + "loss": 1.3487, + "grad_norm": 0.32301780581474304, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468873216, + "loss": 1.3291, + "grad_norm": 0.3487809896469116, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.722560512, + "gpu_mem": 4.468873216, + "train_runtime": 667.7276, + "train_samples_per_second": 13.485, + "train_steps_per_second": 0.21, + "total_flos": 0.0, + "train_loss": 1.4780358808381218 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d69c6bc9ef572e681044e096143c4cad32a3229 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4f1437dabcb778197249a4a85cb12b61b888956d --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.7660550458715596 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..2444e2484ef1248772742f34c3a3bc73beb95b92 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12773376 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-boolq-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2", + "seed": 42, + "timestamp": "2025-09-02T18:16:09.944778" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..f58f8f22395f6fb702211530b36c9a90238f5c6b --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r32-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.700073472, + "gpu_mem": 4.470216704, + "loss": 8.869, + "grad_norm": 28.750593185424805, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.700859904, + "gpu_mem": 4.57254144, + "loss": 8.9376, + "grad_norm": 29.424304962158203, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.701646336, + "gpu_mem": 4.572460032, + "loss": 8.6421, + "grad_norm": 29.651206970214844, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.70223616, + "gpu_mem": 4.572460032, + "loss": 7.9922, + "grad_norm": 30.161924362182617, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.702629376, + "gpu_mem": 4.57239552, + "loss": 7.0073, + "grad_norm": 28.70612907409668, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.7032192, + "gpu_mem": 4.572415488, + "loss": 6.1436, + "grad_norm": 27.442537307739258, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.703809024, + "gpu_mem": 4.572467712, + "loss": 4.5886, + "grad_norm": 26.88599395751953, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.70420224, + "gpu_mem": 4.572553728, + "loss": 3.2917, + "grad_norm": 22.020511627197266, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.704595456, + "gpu_mem": 4.572461568, + "loss": 2.3202, + "grad_norm": 14.599939346313477, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.70518528, + "gpu_mem": 4.572361728, + "loss": 1.5993, + "grad_norm": 9.308046340942383, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.705578496, + "gpu_mem": 4.572466176, + "loss": 1.0754, + "grad_norm": 6.452853679656982, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.705971712, + "gpu_mem": 4.572837888, + "loss": 0.849, + "grad_norm": 3.105442762374878, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.706364928, + "gpu_mem": 4.5724416, + "loss": 0.8386, + "grad_norm": 4.129223346710205, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.706758144, + "gpu_mem": 4.57241856, + "loss": 0.6803, + "grad_norm": 1.3575210571289062, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.70715136, + "gpu_mem": 4.57235712, + "loss": 0.7768, + "grad_norm": 3.818765640258789, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.707544576, + "gpu_mem": 4.5724416, + "loss": 0.7539, + "grad_norm": 5.82582950592041, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.707937792, + "gpu_mem": 4.572481536, + "loss": 0.7608, + "grad_norm": 4.843811988830566, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.7081344, + "gpu_mem": 4.572544512, + "loss": 0.6773, + "grad_norm": 1.6795258522033691, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.708527616, + "gpu_mem": 4.572381696, + "loss": 0.7218, + "grad_norm": 2.0494775772094727, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.708724224, + "gpu_mem": 4.572493824, + "loss": 0.6964, + "grad_norm": 5.073785305023193, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.70911744, + "gpu_mem": 4.572652032, + "loss": 0.6463, + "grad_norm": 2.6963160037994385, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.709510656, + "gpu_mem": 4.572544512, + "loss": 0.7597, + "grad_norm": 4.647146701812744, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.709903872, + "gpu_mem": 4.572516864, + "loss": 0.6843, + "grad_norm": 1.021748661994934, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.71010048, + "gpu_mem": 4.572573696, + "loss": 0.6154, + "grad_norm": 2.3551743030548096, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.710297088, + "gpu_mem": 4.572358656, + "loss": 0.6941, + "grad_norm": 2.721658945083618, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.710690304, + "gpu_mem": 4.572413952, + "loss": 0.7216, + "grad_norm": 2.7436864376068115, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.710886912, + "gpu_mem": 4.572705792, + "loss": 0.6927, + "grad_norm": 3.214296340942383, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.71108352, + "gpu_mem": 4.572384768, + "loss": 0.6844, + "grad_norm": 0.6148536801338196, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.711280128, + "gpu_mem": 4.57244928, + "loss": 0.6581, + "grad_norm": 1.5406228303909302, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.711673344, + "gpu_mem": 4.572527616, + "loss": 0.746, + "grad_norm": 2.7768375873565674, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.711869952, + "gpu_mem": 4.572331008, + "loss": 0.6004, + "grad_norm": 1.4867066144943237, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.712263168, + "gpu_mem": 4.572444672, + "loss": 0.6804, + "grad_norm": 0.6709100604057312, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.712459776, + "gpu_mem": 4.572682752, + "loss": 0.7777, + "grad_norm": 3.5915029048919678, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.712656384, + "gpu_mem": 4.572384768, + "loss": 0.7569, + "grad_norm": 4.242762088775635, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.712852992, + "gpu_mem": 4.5725952, + "loss": 0.6485, + "grad_norm": 0.7949064373970032, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.71403264, + "gpu_mem": 4.572546048, + "loss": 0.6567, + "grad_norm": 0.9245438575744629, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.714229248, + "gpu_mem": 4.57235712, + "loss": 0.6297, + "grad_norm": 1.2665709257125854, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.714425856, + "gpu_mem": 4.572604416, + "loss": 0.724, + "grad_norm": 1.690063238143921, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.714622464, + "gpu_mem": 4.572983808, + "loss": 0.6796, + "grad_norm": 0.7226794958114624, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.714819072, + "gpu_mem": 4.572553728, + "loss": 0.6057, + "grad_norm": 1.2654309272766113, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.71501568, + "gpu_mem": 4.572781056, + "loss": 0.6746, + "grad_norm": 0.7748790383338928, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.715212288, + "gpu_mem": 4.572678144, + "loss": 0.6027, + "grad_norm": 0.7013700604438782, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.715212288, + "gpu_mem": 4.572499968, + "loss": 0.6246, + "grad_norm": 1.265302300453186, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.715212288, + "gpu_mem": 4.572642816, + "loss": 0.5883, + "grad_norm": 0.759946882724762, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.715408896, + "gpu_mem": 4.572423168, + "loss": 0.9142, + "grad_norm": 4.669005393981934, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.715605504, + "gpu_mem": 4.572665856, + "loss": 0.6482, + "grad_norm": 0.665550947189331, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.715802112, + "gpu_mem": 4.572389376, + "loss": 0.6341, + "grad_norm": 2.4424753189086914, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.71599872, + "gpu_mem": 4.572466176, + "loss": 0.5722, + "grad_norm": 1.550267219543457, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.71599872, + "gpu_mem": 4.572483072, + "loss": 0.9536, + "grad_norm": 4.4340105056762695, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.716195328, + "gpu_mem": 4.572421632, + "loss": 0.714, + "grad_norm": 2.585852861404419, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.716195328, + "gpu_mem": 4.57242624, + "loss": 0.5802, + "grad_norm": 0.6010729074478149, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.716195328, + "gpu_mem": 4.572506112, + "loss": 0.6448, + "grad_norm": 0.6412949562072754, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.716391936, + "gpu_mem": 4.572529152, + "loss": 0.7186, + "grad_norm": 3.4634180068969727, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.716588544, + "gpu_mem": 4.57245696, + "loss": 0.707, + "grad_norm": 1.6346309185028076, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.716785152, + "gpu_mem": 4.572727296, + "loss": 0.6686, + "grad_norm": 1.974165439605713, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.716785152, + "gpu_mem": 4.572513792, + "loss": 0.7057, + "grad_norm": 1.2388203144073486, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.71698176, + "gpu_mem": 4.572507648, + "loss": 0.6182, + "grad_norm": 0.7054196000099182, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.71698176, + "gpu_mem": 4.5724032, + "loss": 0.5996, + "grad_norm": 0.7088422775268555, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.717178368, + "gpu_mem": 4.572420096, + "loss": 0.6761, + "grad_norm": 0.9367287158966064, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.717178368, + "gpu_mem": 4.572513792, + "loss": 0.6288, + "grad_norm": 1.6509524583816528, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.717178368, + "gpu_mem": 4.572524544, + "loss": 0.6335, + "grad_norm": 1.4897509813308716, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.717178368, + "gpu_mem": 4.572512256, + "loss": 0.7055, + "grad_norm": 0.7237626910209656, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.717374976, + "gpu_mem": 4.572504576, + "loss": 0.5146, + "grad_norm": 0.902273952960968, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.717374976, + "gpu_mem": 4.57243392, + "loss": 0.5908, + "grad_norm": 0.5954427719116211, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.717374976, + "gpu_mem": 4.572478464, + "loss": 0.6396, + "grad_norm": 1.3040223121643066, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.717571584, + "gpu_mem": 4.572672, + "loss": 0.5905, + "grad_norm": 0.42842549085617065, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.717768192, + "gpu_mem": 4.572381696, + "loss": 0.6612, + "grad_norm": 0.8538391590118408, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.717768192, + "gpu_mem": 4.57234944, + "loss": 0.7059, + "grad_norm": 1.0457584857940674, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.717768192, + "gpu_mem": 4.572415488, + "loss": 0.6098, + "grad_norm": 0.9364701509475708, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.7179648, + "gpu_mem": 4.572409344, + "loss": 0.5587, + "grad_norm": 0.553261399269104, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.7179648, + "gpu_mem": 4.572638208, + "loss": 0.556, + "grad_norm": 0.8999815583229065, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.7179648, + "gpu_mem": 4.572630528, + "loss": 0.5604, + "grad_norm": 1.0098932981491089, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.7179648, + "gpu_mem": 4.572596736, + "loss": 0.7627, + "grad_norm": 2.4771575927734375, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.718161408, + "gpu_mem": 4.57245696, + "loss": 0.5323, + "grad_norm": 0.6309259533882141, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.718161408, + "gpu_mem": 4.572381696, + "loss": 0.5456, + "grad_norm": 1.8133351802825928, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.718161408, + "gpu_mem": 4.572321792, + "loss": 0.6501, + "grad_norm": 1.9666740894317627, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.718161408, + "gpu_mem": 4.57239552, + "loss": 0.5479, + "grad_norm": 1.4216151237487793, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.718161408, + "gpu_mem": 4.572447744, + "loss": 0.8347, + "grad_norm": 3.658280611038208, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.718358016, + "gpu_mem": 4.57257984, + "loss": 0.8043, + "grad_norm": 2.852076292037964, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.718358016, + "gpu_mem": 4.572470784, + "loss": 0.6989, + "grad_norm": 2.2610459327697754, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.718358016, + "gpu_mem": 4.572350976, + "loss": 0.6621, + "grad_norm": 1.6608003377914429, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.718358016, + "gpu_mem": 4.572420096, + "loss": 0.6081, + "grad_norm": 0.6904788017272949, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.718358016, + "gpu_mem": 4.572519936, + "loss": 0.6395, + "grad_norm": 1.14455246925354, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.718358016, + "gpu_mem": 4.572483072, + "loss": 0.7262, + "grad_norm": 2.554008960723877, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.718358016, + "gpu_mem": 4.572515328, + "loss": 0.6261, + "grad_norm": 1.7196681499481201, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.718358016, + "gpu_mem": 4.572466176, + "loss": 0.6168, + "grad_norm": 0.6027466654777527, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572473856, + "loss": 0.5868, + "grad_norm": 1.1970185041427612, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.718554624, + "gpu_mem": 4.57261824, + "loss": 0.5899, + "grad_norm": 0.9783823490142822, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572400128, + "loss": 0.7, + "grad_norm": 1.5105644464492798, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572453888, + "loss": 0.5708, + "grad_norm": 0.6399961709976196, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572421632, + "loss": 0.567, + "grad_norm": 0.8920333981513977, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.718554624, + "gpu_mem": 4.57250304, + "loss": 0.6471, + "grad_norm": 0.5527065396308899, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572306432, + "loss": 0.7101, + "grad_norm": 0.6522276997566223, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572420096, + "loss": 0.6556, + "grad_norm": 1.8380529880523682, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572440064, + "loss": 0.6369, + "grad_norm": 0.8491230010986328, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.718554624, + "gpu_mem": 4.572478464, + "loss": 0.5301, + "grad_norm": 0.5455194115638733, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572463104, + "loss": 0.6604, + "grad_norm": 0.8768364191055298, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572375552, + "loss": 0.6361, + "grad_norm": 1.0682636499404907, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572324864, + "loss": 0.5656, + "grad_norm": 1.0466759204864502, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.718751232, + "gpu_mem": 4.5724416, + "loss": 0.5935, + "grad_norm": 0.5262985229492188, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572473856, + "loss": 0.5788, + "grad_norm": 0.7149959802627563, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572507648, + "loss": 0.6063, + "grad_norm": 0.7704340219497681, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572558336, + "loss": 0.6116, + "grad_norm": 0.7952002286911011, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572463104, + "loss": 0.5912, + "grad_norm": 0.674315869808197, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.718751232, + "gpu_mem": 4.57256448, + "loss": 0.6019, + "grad_norm": 0.7072395086288452, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572515328, + "loss": 0.5798, + "grad_norm": 0.7142025232315063, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.718751232, + "gpu_mem": 4.5724032, + "loss": 0.4805, + "grad_norm": 0.5528094172477722, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.718751232, + "gpu_mem": 4.57258752, + "loss": 0.5816, + "grad_norm": 0.81120765209198, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.718751232, + "gpu_mem": 4.5724416, + "loss": 0.6088, + "grad_norm": 0.9077268242835999, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572444672, + "loss": 0.5937, + "grad_norm": 0.8482281565666199, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.718751232, + "gpu_mem": 4.572413952, + "loss": 0.5373, + "grad_norm": 0.5784004330635071, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572460032, + "loss": 0.5455, + "grad_norm": 1.168546199798584, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572450816, + "loss": 0.592, + "grad_norm": 1.0730594396591187, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572432384, + "loss": 0.5258, + "grad_norm": 0.8413960933685303, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572507648, + "loss": 0.5347, + "grad_norm": 0.6944677829742432, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572427776, + "loss": 0.5365, + "grad_norm": 0.7183632850646973, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.71894784, + "gpu_mem": 4.57231872, + "loss": 0.5478, + "grad_norm": 1.401812195777893, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.71894784, + "gpu_mem": 4.5725568, + "loss": 0.6311, + "grad_norm": 1.5453472137451172, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572727296, + "loss": 0.4633, + "grad_norm": 0.7705002427101135, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572460032, + "loss": 0.5521, + "grad_norm": 1.450700044631958, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.71894784, + "gpu_mem": 4.57248768, + "loss": 0.5243, + "grad_norm": 0.9056739211082458, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572538368, + "loss": 0.4851, + "grad_norm": 0.8386183381080627, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572347904, + "loss": 0.5434, + "grad_norm": 1.5590592622756958, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572790272, + "loss": 0.4759, + "grad_norm": 0.9887855052947998, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572516864, + "loss": 0.5064, + "grad_norm": 0.8324416875839233, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572400128, + "loss": 0.5669, + "grad_norm": 1.033551812171936, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572839424, + "loss": 0.5166, + "grad_norm": 1.09823477268219, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572615168, + "loss": 0.4587, + "grad_norm": 1.2634567022323608, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572655104, + "loss": 0.5442, + "grad_norm": 1.2347668409347534, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572436992, + "loss": 0.6395, + "grad_norm": 0.9695599675178528, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572566016, + "loss": 0.5222, + "grad_norm": 0.9749922752380371, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572647424, + "loss": 0.4685, + "grad_norm": 1.3722708225250244, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572430848, + "loss": 0.5021, + "grad_norm": 1.3059203624725342, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.71894784, + "gpu_mem": 4.57256448, + "loss": 0.4843, + "grad_norm": 1.2460843324661255, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.71894784, + "gpu_mem": 4.57258752, + "loss": 0.5122, + "grad_norm": 1.2564277648925781, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572424704, + "loss": 0.4726, + "grad_norm": 1.3146121501922607, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572304896, + "loss": 0.5684, + "grad_norm": 1.791987419128418, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572486144, + "loss": 0.4931, + "grad_norm": 1.3301362991333008, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572384768, + "loss": 0.4281, + "grad_norm": 1.0045467615127563, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572436992, + "loss": 0.54, + "grad_norm": 1.1835798025131226, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572469248, + "loss": 0.6125, + "grad_norm": 2.2839300632476807, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572609024, + "loss": 0.5064, + "grad_norm": 2.0525028705596924, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572592128, + "loss": 0.6009, + "grad_norm": 1.487833857536316, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572784128, + "loss": 0.5106, + "grad_norm": 0.8827306032180786, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.71894784, + "gpu_mem": 4.57249536, + "loss": 0.4681, + "grad_norm": 0.9175130128860474, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572530688, + "loss": 0.5451, + "grad_norm": 1.7076985836029053, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.71894784, + "gpu_mem": 4.572429312, + "loss": 0.599, + "grad_norm": 1.501021146774292, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623687168, + "loss": 0.745, + "grad_norm": 2.972140073776245, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623622656, + "loss": 0.409, + "grad_norm": 1.5939456224441528, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62345984, + "loss": 0.4245, + "grad_norm": 1.1110289096832275, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623532032, + "loss": 0.4476, + "grad_norm": 0.7456145286560059, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62356736, + "loss": 0.3984, + "grad_norm": 1.1336967945098877, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623591936, + "loss": 0.6373, + "grad_norm": 2.3109586238861084, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623553536, + "loss": 0.5151, + "grad_norm": 1.8028135299682617, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62377472, + "loss": 0.4513, + "grad_norm": 1.581742525100708, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62368256, + "loss": 0.439, + "grad_norm": 1.510156273841858, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623588864, + "loss": 0.4373, + "grad_norm": 1.2448142766952515, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623515136, + "loss": 0.4529, + "grad_norm": 1.3465746641159058, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623863808, + "loss": 0.4393, + "grad_norm": 1.2849854230880737, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623458304, + "loss": 0.4418, + "grad_norm": 2.1204171180725098, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623404544, + "loss": 0.3064, + "grad_norm": 1.0373823642730713, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.71894784, + "gpu_mem": 4.624180224, + "loss": 0.3804, + "grad_norm": 1.238021969795227, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623656448, + "loss": 0.4082, + "grad_norm": 1.2749755382537842, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623568896, + "loss": 0.4603, + "grad_norm": 1.539047360420227, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623518208, + "loss": 0.4308, + "grad_norm": 1.6282577514648438, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62361344, + "loss": 0.3791, + "grad_norm": 1.2951699495315552, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623530496, + "loss": 0.5118, + "grad_norm": 2.0720505714416504, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623548928, + "loss": 0.4853, + "grad_norm": 1.637773036956787, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62363648, + "loss": 0.4432, + "grad_norm": 1.6053849458694458, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62352128, + "loss": 0.4236, + "grad_norm": 1.6041933298110962, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623584256, + "loss": 0.5736, + "grad_norm": 2.500246524810791, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623492096, + "loss": 0.4105, + "grad_norm": 1.6655519008636475, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623791616, + "loss": 0.5223, + "grad_norm": 1.4275615215301514, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623515136, + "loss": 0.4822, + "grad_norm": 1.8189388513565063, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623481344, + "loss": 0.4464, + "grad_norm": 1.3254585266113281, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623619584, + "loss": 0.418, + "grad_norm": 1.4017231464385986, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623717888, + "loss": 0.4882, + "grad_norm": 1.5580755472183228, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623464448, + "loss": 0.4563, + "grad_norm": 1.211509108543396, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623564288, + "loss": 0.5855, + "grad_norm": 2.1758177280426025, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62353664, + "loss": 0.4123, + "grad_norm": 2.215142250061035, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623473664, + "loss": 0.3528, + "grad_norm": 1.2972583770751953, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623691776, + "loss": 0.4597, + "grad_norm": 1.5611552000045776, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623588864, + "loss": 0.4159, + "grad_norm": 1.1363767385482788, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62353664, + "loss": 0.3781, + "grad_norm": 1.4128774404525757, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623515136, + "loss": 0.4507, + "grad_norm": 1.3687834739685059, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623524352, + "loss": 0.3864, + "grad_norm": 1.4887542724609375, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623456768, + "loss": 0.5193, + "grad_norm": 1.3200255632400513, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623619584, + "loss": 0.3466, + "grad_norm": 1.1429715156555176, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623489024, + "loss": 0.4356, + "grad_norm": 1.324742078781128, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623608832, + "loss": 0.4665, + "grad_norm": 1.4559569358825684, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623427584, + "loss": 0.3313, + "grad_norm": 1.6327720880508423, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62355968, + "loss": 0.3804, + "grad_norm": 1.8900833129882812, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623533568, + "loss": 0.4202, + "grad_norm": 1.2170467376708984, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623499776, + "loss": 0.4151, + "grad_norm": 1.1876418590545654, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623604224, + "loss": 0.2875, + "grad_norm": 0.9755733609199524, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623599616, + "loss": 0.4251, + "grad_norm": 1.2344481945037842, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623458304, + "loss": 0.3392, + "grad_norm": 1.6784648895263672, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623650304, + "loss": 0.4044, + "grad_norm": 1.181406855583191, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623501312, + "loss": 0.4051, + "grad_norm": 1.790902853012085, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623604224, + "loss": 0.3635, + "grad_norm": 1.479731798171997, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623806976, + "loss": 0.351, + "grad_norm": 1.8329912424087524, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623608832, + "loss": 0.3992, + "grad_norm": 1.5027717351913452, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623495168, + "loss": 0.3622, + "grad_norm": 1.2444499731063843, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623508992, + "loss": 0.4287, + "grad_norm": 1.5587791204452515, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623553536, + "loss": 0.3981, + "grad_norm": 1.4963443279266357, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623495168, + "loss": 0.3163, + "grad_norm": 1.3461599349975586, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62372864, + "loss": 0.5083, + "grad_norm": 1.9481723308563232, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62375936, + "loss": 0.4477, + "grad_norm": 2.8035244941711426, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623685632, + "loss": 0.4836, + "grad_norm": 1.6210641860961914, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623573504, + "loss": 0.4974, + "grad_norm": 1.8263094425201416, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623522816, + "loss": 0.363, + "grad_norm": 1.8436623811721802, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623492096, + "loss": 0.4691, + "grad_norm": 1.4035260677337646, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623515136, + "loss": 0.3799, + "grad_norm": 1.6010749340057373, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62359808, + "loss": 0.3986, + "grad_norm": 1.4254558086395264, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623525888, + "loss": 0.514, + "grad_norm": 2.6828815937042236, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623691776, + "loss": 0.345, + "grad_norm": 1.1209403276443481, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623533568, + "loss": 0.4194, + "grad_norm": 1.9598803520202637, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623510528, + "loss": 0.3866, + "grad_norm": 1.5407321453094482, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62363648, + "loss": 0.4214, + "grad_norm": 1.4244434833526611, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623670272, + "loss": 0.4378, + "grad_norm": 1.533613920211792, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623538176, + "loss": 0.38, + "grad_norm": 1.4230495691299438, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62367488, + "loss": 0.3922, + "grad_norm": 1.5885331630706787, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623588864, + "loss": 0.3761, + "grad_norm": 1.7837285995483398, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623553536, + "loss": 0.2813, + "grad_norm": 1.1231423616409302, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623518208, + "loss": 0.2933, + "grad_norm": 1.1998182535171509, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.71894784, + "gpu_mem": 4.6236672, + "loss": 0.339, + "grad_norm": 1.4583945274353027, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623556608, + "loss": 0.3676, + "grad_norm": 1.565933108329773, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623501312, + "loss": 0.4049, + "grad_norm": 1.3880119323730469, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623442944, + "loss": 0.33, + "grad_norm": 1.1997932195663452, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623499776, + "loss": 0.3258, + "grad_norm": 1.2552076578140259, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623777792, + "loss": 0.3763, + "grad_norm": 1.4037959575653076, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623501312, + "loss": 0.4085, + "grad_norm": 1.4837490320205688, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623814656, + "loss": 0.3702, + "grad_norm": 1.548228144645691, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62369024, + "loss": 0.3564, + "grad_norm": 1.5536974668502808, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623446016, + "loss": 0.3021, + "grad_norm": 1.1162501573562622, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62350592, + "loss": 0.3577, + "grad_norm": 1.971134066581726, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62356736, + "loss": 0.4558, + "grad_norm": 1.6547377109527588, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623568896, + "loss": 0.4636, + "grad_norm": 1.6322640180587769, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623822336, + "loss": 0.438, + "grad_norm": 1.7353867292404175, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623472128, + "loss": 0.4591, + "grad_norm": 2.0931665897369385, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623768576, + "loss": 0.37, + "grad_norm": 1.5519907474517822, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623630336, + "loss": 0.3747, + "grad_norm": 1.5780291557312012, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62348288, + "loss": 0.4226, + "grad_norm": 1.8254464864730835, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623622656, + "loss": 0.3204, + "grad_norm": 1.3813347816467285, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623501312, + "loss": 0.375, + "grad_norm": 1.1614993810653687, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623595008, + "loss": 0.4449, + "grad_norm": 1.3523486852645874, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62361344, + "loss": 0.2386, + "grad_norm": 1.20607590675354, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623565824, + "loss": 0.4162, + "grad_norm": 1.4451812505722046, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623472128, + "loss": 0.2798, + "grad_norm": 1.4528095722198486, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623564288, + "loss": 0.3847, + "grad_norm": 2.0149261951446533, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623476736, + "loss": 0.2572, + "grad_norm": 1.2979618310928345, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623519744, + "loss": 0.4533, + "grad_norm": 2.186067581176758, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62371328, + "loss": 0.4749, + "grad_norm": 2.4554576873779297, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623508992, + "loss": 0.3425, + "grad_norm": 1.4987815618515015, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623665664, + "loss": 0.4362, + "grad_norm": 1.7981704473495483, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623492096, + "loss": 0.3805, + "grad_norm": 1.6738426685333252, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62390528, + "loss": 0.4587, + "grad_norm": 1.3810175657272339, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623564288, + "loss": 0.4616, + "grad_norm": 2.2409305572509766, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623481344, + "loss": 0.4302, + "grad_norm": 2.257272958755493, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62359808, + "loss": 0.2501, + "grad_norm": 1.161773443222046, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623553536, + "loss": 0.2696, + "grad_norm": 1.5331093072891235, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623510528, + "loss": 0.3098, + "grad_norm": 1.3339146375656128, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623547392, + "loss": 0.333, + "grad_norm": 1.137773036956787, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623634944, + "loss": 0.3358, + "grad_norm": 1.408526062965393, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623553536, + "loss": 0.437, + "grad_norm": 1.417105793952942, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623768576, + "loss": 0.4046, + "grad_norm": 1.6505175828933716, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623561216, + "loss": 0.4626, + "grad_norm": 1.8442293405532837, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623565824, + "loss": 0.2807, + "grad_norm": 1.2415339946746826, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623576576, + "loss": 0.3846, + "grad_norm": 2.06632924079895, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623614976, + "loss": 0.3243, + "grad_norm": 2.0014865398406982, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.71894784, + "gpu_mem": 4.6236672, + "loss": 0.3303, + "grad_norm": 1.3684595823287964, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623525888, + "loss": 0.4302, + "grad_norm": 1.1140187978744507, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62340608, + "loss": 0.3408, + "grad_norm": 1.4989228248596191, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623633408, + "loss": 0.3916, + "grad_norm": 1.5992313623428345, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623877632, + "loss": 0.3776, + "grad_norm": 1.6213064193725586, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623538176, + "loss": 0.364, + "grad_norm": 1.8253496885299683, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623484416, + "loss": 0.5017, + "grad_norm": 1.8761091232299805, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623647232, + "loss": 0.505, + "grad_norm": 1.9046614170074463, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623587328, + "loss": 0.4148, + "grad_norm": 1.8106591701507568, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62356736, + "loss": 0.3758, + "grad_norm": 1.356575608253479, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623502848, + "loss": 0.4082, + "grad_norm": 1.3462706804275513, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623931392, + "loss": 0.299, + "grad_norm": 1.9833860397338867, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623638016, + "loss": 0.3079, + "grad_norm": 1.5204863548278809, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623493632, + "loss": 0.4321, + "grad_norm": 1.772533655166626, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623547392, + "loss": 0.3172, + "grad_norm": 1.3415104150772095, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623965184, + "loss": 0.33, + "grad_norm": 1.3366953134536743, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623734784, + "loss": 0.2836, + "grad_norm": 1.4056572914123535, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623519744, + "loss": 0.5023, + "grad_norm": 1.645820140838623, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623604224, + "loss": 0.315, + "grad_norm": 1.207215428352356, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.71894784, + "gpu_mem": 4.62352896, + "loss": 0.4202, + "grad_norm": 1.5676509141921997, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623564288, + "loss": 0.4466, + "grad_norm": 1.578613042831421, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623647232, + "loss": 0.3404, + "grad_norm": 1.2707529067993164, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.71894784, + "gpu_mem": 4.623564288, + "loss": 0.5055, + "grad_norm": 2.6826367378234863, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.71894784, + "gpu_mem": 4.6235904, + "loss": 0.4529, + "grad_norm": 1.5452101230621338, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.71894784, + "gpu_mem": 4.6235904, + "train_runtime": 4464.7893, + "train_samples_per_second": 4.223, + "train_steps_per_second": 0.066, + "total_flos": 0.0, + "train_loss": 0.6957279571465084 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ded8039b496858a8aa3d756f427279337f8964 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..008e6fd7264730cb656b03ddca961d9020127573 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.6914373088685015 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..840b9e18bf7c5e16f4633b011896a867628b0f0b --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3163776 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-boolq-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2", + "seed": 42, + "timestamp": "2025-09-02T11:22:28.996126" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..c8b560ad2c9ef9050e6ca292e16d1efeb80fcba9 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-boolq-r8-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.697869824, + "gpu_mem": 4.431080448, + "loss": 8.869, + "grad_norm": 16.31131935119629, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.698656256, + "gpu_mem": 4.456607232, + "loss": 8.9376, + "grad_norm": 16.644018173217773, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.699442688, + "gpu_mem": 4.456525824, + "loss": 8.8818, + "grad_norm": 16.7022762298584, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.70022912, + "gpu_mem": 4.456525824, + "loss": 8.7159, + "grad_norm": 17.160343170166016, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.700622336, + "gpu_mem": 4.456461312, + "loss": 8.3783, + "grad_norm": 16.896221160888672, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.70121216, + "gpu_mem": 4.45648128, + "loss": 8.2917, + "grad_norm": 15.832453727722168, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.701801984, + "gpu_mem": 4.456533504, + "loss": 7.8427, + "grad_norm": 16.776145935058594, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.702391808, + "gpu_mem": 4.45661952, + "loss": 7.3845, + "grad_norm": 17.093584060668945, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.702785024, + "gpu_mem": 4.45652736, + "loss": 6.7413, + "grad_norm": 16.384645462036133, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.703374848, + "gpu_mem": 4.45642752, + "loss": 6.2559, + "grad_norm": 15.777255058288574, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.703768064, + "gpu_mem": 4.456531968, + "loss": 5.5563, + "grad_norm": 16.534765243530273, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.70416128, + "gpu_mem": 4.45690368, + "loss": 4.8786, + "grad_norm": 16.408397674560547, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.704554496, + "gpu_mem": 4.456507392, + "loss": 4.2795, + "grad_norm": 15.05648422241211, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.704947712, + "gpu_mem": 4.456484352, + "loss": 3.3109, + "grad_norm": 14.064278602600098, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.70514432, + "gpu_mem": 4.456422912, + "loss": 2.797, + "grad_norm": 10.931859970092773, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.705537536, + "gpu_mem": 4.456507392, + "loss": 2.2972, + "grad_norm": 8.908764839172363, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.705930752, + "gpu_mem": 4.456547328, + "loss": 1.7691, + "grad_norm": 6.247945308685303, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.706323968, + "gpu_mem": 4.456610304, + "loss": 1.6227, + "grad_norm": 4.6546430587768555, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.706717184, + "gpu_mem": 4.456447488, + "loss": 1.1909, + "grad_norm": 3.135087251663208, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.706913792, + "gpu_mem": 4.456559616, + "loss": 0.9808, + "grad_norm": 3.1353225708007812, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.7071104, + "gpu_mem": 4.456717824, + "loss": 0.8854, + "grad_norm": 2.3749969005584717, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.707503616, + "gpu_mem": 4.456610304, + "loss": 0.8633, + "grad_norm": 1.931467056274414, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.707896832, + "gpu_mem": 4.456582656, + "loss": 0.7937, + "grad_norm": 1.158082365989685, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.70809344, + "gpu_mem": 4.456639488, + "loss": 0.6993, + "grad_norm": 1.0842634439468384, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.708486656, + "gpu_mem": 4.456424448, + "loss": 0.6946, + "grad_norm": 0.9029825925827026, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.708683264, + "gpu_mem": 4.456479744, + "loss": 0.675, + "grad_norm": 1.2476006746292114, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.708879872, + "gpu_mem": 4.456771584, + "loss": 0.7661, + "grad_norm": 4.477359771728516, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.709273088, + "gpu_mem": 4.45645056, + "loss": 0.6685, + "grad_norm": 1.4081227779388428, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.709469696, + "gpu_mem": 4.456515072, + "loss": 0.6329, + "grad_norm": 1.8570095300674438, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.709666304, + "gpu_mem": 4.456593408, + "loss": 0.831, + "grad_norm": 4.979935169219971, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.709862912, + "gpu_mem": 4.4563968, + "loss": 0.746, + "grad_norm": 4.405153751373291, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.710256128, + "gpu_mem": 4.456510464, + "loss": 0.8674, + "grad_norm": 4.953607082366943, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.710452736, + "gpu_mem": 4.456748544, + "loss": 0.6983, + "grad_norm": 1.5675383806228638, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.710649344, + "gpu_mem": 4.45645056, + "loss": 0.7476, + "grad_norm": 5.215970039367676, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.710845952, + "gpu_mem": 4.456660992, + "loss": 0.7061, + "grad_norm": 2.9353201389312744, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.71104256, + "gpu_mem": 4.45661184, + "loss": 0.7258, + "grad_norm": 2.450796604156494, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.711239168, + "gpu_mem": 4.456422912, + "loss": 0.6573, + "grad_norm": 0.8968814015388489, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.711435776, + "gpu_mem": 4.456670208, + "loss": 0.8757, + "grad_norm": 3.613511085510254, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.711828992, + "gpu_mem": 4.4570496, + "loss": 0.7637, + "grad_norm": 2.7372958660125732, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.7120256, + "gpu_mem": 4.45661952, + "loss": 0.7206, + "grad_norm": 2.3666133880615234, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.712222208, + "gpu_mem": 4.456846848, + "loss": 0.7182, + "grad_norm": 1.2755488157272339, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.712418816, + "gpu_mem": 4.456743936, + "loss": 0.6892, + "grad_norm": 0.8972368836402893, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.712418816, + "gpu_mem": 4.45656576, + "loss": 0.7419, + "grad_norm": 2.532297134399414, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.712615424, + "gpu_mem": 4.456708608, + "loss": 0.7337, + "grad_norm": 2.325652599334717, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.712812032, + "gpu_mem": 4.45648896, + "loss": 0.702, + "grad_norm": 0.534309983253479, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.71300864, + "gpu_mem": 4.456731648, + "loss": 0.6768, + "grad_norm": 0.606399416923523, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.71300864, + "gpu_mem": 4.456455168, + "loss": 0.6367, + "grad_norm": 0.3364216983318329, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.713205248, + "gpu_mem": 4.456531968, + "loss": 0.6205, + "grad_norm": 0.355301171541214, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.713401856, + "gpu_mem": 4.456548864, + "loss": 0.8245, + "grad_norm": 2.323561191558838, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.713401856, + "gpu_mem": 4.456487424, + "loss": 0.6515, + "grad_norm": 0.9249618649482727, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.713598464, + "gpu_mem": 4.456492032, + "loss": 0.6175, + "grad_norm": 0.3961940109729767, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.713795072, + "gpu_mem": 4.456571904, + "loss": 0.6813, + "grad_norm": 0.5494756698608398, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.713795072, + "gpu_mem": 4.456594944, + "loss": 0.6306, + "grad_norm": 1.2922532558441162, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.713795072, + "gpu_mem": 4.456522752, + "loss": 0.7282, + "grad_norm": 0.5947088003158569, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.71399168, + "gpu_mem": 4.456793088, + "loss": 0.6609, + "grad_norm": 1.0830832719802856, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.714188288, + "gpu_mem": 4.456579584, + "loss": 0.6575, + "grad_norm": 0.29077136516571045, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.714188288, + "gpu_mem": 4.45657344, + "loss": 0.6281, + "grad_norm": 0.7768498063087463, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.714384896, + "gpu_mem": 4.456468992, + "loss": 0.6066, + "grad_norm": 0.27297621965408325, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.714384896, + "gpu_mem": 4.456485888, + "loss": 0.7014, + "grad_norm": 1.200810432434082, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.714384896, + "gpu_mem": 4.456579584, + "loss": 0.5967, + "grad_norm": 0.2797515392303467, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.714778112, + "gpu_mem": 4.456590336, + "loss": 0.6199, + "grad_norm": 0.38585567474365234, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.714778112, + "gpu_mem": 4.456578048, + "loss": 0.7541, + "grad_norm": 1.7268927097320557, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.714778112, + "gpu_mem": 4.456570368, + "loss": 0.562, + "grad_norm": 0.5583781003952026, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.71497472, + "gpu_mem": 4.456499712, + "loss": 0.63, + "grad_norm": 0.39976856112480164, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.71497472, + "gpu_mem": 4.456544256, + "loss": 0.6612, + "grad_norm": 0.33548709750175476, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.715171328, + "gpu_mem": 4.456737792, + "loss": 0.6225, + "grad_norm": 0.9883282780647278, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.715171328, + "gpu_mem": 4.456447488, + "loss": 0.6415, + "grad_norm": 0.5853760838508606, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.715171328, + "gpu_mem": 4.456415232, + "loss": 0.7375, + "grad_norm": 1.478963851928711, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.715171328, + "gpu_mem": 4.45648128, + "loss": 0.6351, + "grad_norm": 0.6073285937309265, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.715367936, + "gpu_mem": 4.456475136, + "loss": 0.6643, + "grad_norm": 0.6471723318099976, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.715367936, + "gpu_mem": 4.456704, + "loss": 0.5997, + "grad_norm": 0.6086612939834595, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.715564544, + "gpu_mem": 4.45669632, + "loss": 0.6552, + "grad_norm": 0.50797039270401, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.715564544, + "gpu_mem": 4.456662528, + "loss": 0.6839, + "grad_norm": 0.9377349019050598, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.715564544, + "gpu_mem": 4.456522752, + "loss": 0.6125, + "grad_norm": 1.0703237056732178, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.715564544, + "gpu_mem": 4.456447488, + "loss": 0.5957, + "grad_norm": 1.6800501346588135, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.715761152, + "gpu_mem": 4.456387584, + "loss": 0.6732, + "grad_norm": 0.803346574306488, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.715761152, + "gpu_mem": 4.456461312, + "loss": 0.5722, + "grad_norm": 0.6176803708076477, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.715761152, + "gpu_mem": 4.456513536, + "loss": 0.8614, + "grad_norm": 3.0320825576782227, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456645632, + "loss": 0.7359, + "grad_norm": 1.9087793827056885, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456536576, + "loss": 0.6806, + "grad_norm": 1.5242702960968018, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456416768, + "loss": 0.6676, + "grad_norm": 1.4538636207580566, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456485888, + "loss": 0.6589, + "grad_norm": 1.010715365409851, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456585728, + "loss": 0.6419, + "grad_norm": 0.46907028555870056, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456548864, + "loss": 0.7304, + "grad_norm": 2.3578789234161377, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.71595776, + "gpu_mem": 4.45658112, + "loss": 0.7484, + "grad_norm": 2.7270755767822266, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456531968, + "loss": 0.7655, + "grad_norm": 2.6161556243896484, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456539648, + "loss": 0.6873, + "grad_norm": 1.6845486164093018, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.71595776, + "gpu_mem": 4.456684032, + "loss": 0.6666, + "grad_norm": 1.6000875234603882, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.71595776, + "gpu_mem": 4.45646592, + "loss": 0.6313, + "grad_norm": 0.3688609302043915, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.716154368, + "gpu_mem": 4.45651968, + "loss": 0.6559, + "grad_norm": 0.5462937355041504, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456487424, + "loss": 0.5752, + "grad_norm": 0.3375707268714905, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456568832, + "loss": 0.7786, + "grad_norm": 1.9867311716079712, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456372224, + "loss": 0.8098, + "grad_norm": 1.5615675449371338, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456485888, + "loss": 0.5768, + "grad_norm": 0.26242563128471375, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456505856, + "loss": 0.6772, + "grad_norm": 0.5399349927902222, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456544256, + "loss": 0.6234, + "grad_norm": 0.37212157249450684, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456528896, + "loss": 0.6843, + "grad_norm": 0.3934764564037323, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456441344, + "loss": 0.6301, + "grad_norm": 0.3429834842681885, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456390656, + "loss": 0.6193, + "grad_norm": 0.4775605797767639, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456507392, + "loss": 0.6625, + "grad_norm": 1.0324757099151611, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456539648, + "loss": 0.6499, + "grad_norm": 0.4704483151435852, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.716350976, + "gpu_mem": 4.45657344, + "loss": 0.6365, + "grad_norm": 0.34163546562194824, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456624128, + "loss": 0.6552, + "grad_norm": 0.3359861969947815, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456528896, + "loss": 0.637, + "grad_norm": 0.7587937116622925, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456630272, + "loss": 0.6384, + "grad_norm": 0.5187345147132874, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.716350976, + "gpu_mem": 4.45658112, + "loss": 0.6034, + "grad_norm": 0.45107683539390564, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.716350976, + "gpu_mem": 4.456468992, + "loss": 0.5533, + "grad_norm": 0.3475896120071411, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456653312, + "loss": 0.6198, + "grad_norm": 0.4027327299118042, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456507392, + "loss": 0.6759, + "grad_norm": 0.593356728553772, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456510464, + "loss": 0.6423, + "grad_norm": 0.6139599680900574, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456479744, + "loss": 0.6022, + "grad_norm": 0.659511387348175, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456525824, + "loss": 0.6021, + "grad_norm": 0.6171076893806458, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456516608, + "loss": 0.6106, + "grad_norm": 0.4746922254562378, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456498176, + "loss": 0.6051, + "grad_norm": 1.030656337738037, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.716547584, + "gpu_mem": 4.45657344, + "loss": 0.5775, + "grad_norm": 0.7366604208946228, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456493568, + "loss": 0.6327, + "grad_norm": 0.5481318831443787, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456384512, + "loss": 0.6409, + "grad_norm": 0.8867917656898499, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456622592, + "loss": 0.56, + "grad_norm": 0.6395578980445862, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456793088, + "loss": 0.5706, + "grad_norm": 0.46005842089653015, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456525824, + "loss": 0.5853, + "grad_norm": 0.47217872738838196, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456553472, + "loss": 0.5671, + "grad_norm": 0.49688583612442017, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.716547584, + "gpu_mem": 4.45660416, + "loss": 0.5528, + "grad_norm": 0.47495418787002563, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456413696, + "loss": 0.5849, + "grad_norm": 1.1594964265823364, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456856064, + "loss": 0.5625, + "grad_norm": 0.6249487400054932, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456582656, + "loss": 0.5391, + "grad_norm": 0.8827047348022461, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.716547584, + "gpu_mem": 4.45646592, + "loss": 0.5567, + "grad_norm": 0.6990951895713806, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456905216, + "loss": 0.6052, + "grad_norm": 1.0248686075210571, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.716547584, + "gpu_mem": 4.45668096, + "loss": 0.5693, + "grad_norm": 0.7876132726669312, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.716547584, + "gpu_mem": 4.456720896, + "loss": 0.6434, + "grad_norm": 1.0540376901626587, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456502784, + "loss": 0.6919, + "grad_norm": 0.5420183539390564, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456631808, + "loss": 0.624, + "grad_norm": 0.5553213953971863, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456713216, + "loss": 0.5766, + "grad_norm": 0.9684531688690186, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.717530624, + "gpu_mem": 4.45649664, + "loss": 0.6031, + "grad_norm": 1.0884336233139038, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456630272, + "loss": 0.5769, + "grad_norm": 1.7601666450500488, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456653312, + "loss": 0.5579, + "grad_norm": 0.4084602892398834, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456490496, + "loss": 0.501, + "grad_norm": 0.4869396984577179, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456370688, + "loss": 0.6389, + "grad_norm": 1.3335785865783691, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456551936, + "loss": 0.6286, + "grad_norm": 1.5002145767211914, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.717530624, + "gpu_mem": 4.45645056, + "loss": 0.6241, + "grad_norm": 1.3122655153274536, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456502784, + "loss": 0.5564, + "grad_norm": 0.5609496831893921, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.717530624, + "gpu_mem": 4.45653504, + "loss": 0.5796, + "grad_norm": 0.667332112789154, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456674816, + "loss": 0.5634, + "grad_norm": 1.4193509817123413, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.717530624, + "gpu_mem": 4.45665792, + "loss": 0.6842, + "grad_norm": 0.648862898349762, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.717530624, + "gpu_mem": 4.45684992, + "loss": 0.5783, + "grad_norm": 0.5007690787315369, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456561152, + "loss": 0.5853, + "grad_norm": 1.1658531427383423, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.717530624, + "gpu_mem": 4.45659648, + "loss": 0.554, + "grad_norm": 0.46922358870506287, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.717530624, + "gpu_mem": 4.456495104, + "loss": 0.5894, + "grad_norm": 0.47734344005584717, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469353984, + "loss": 0.9218, + "grad_norm": 2.0238542556762695, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469289472, + "loss": 0.5777, + "grad_norm": 0.7447729706764221, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469126656, + "loss": 0.6005, + "grad_norm": 0.912347674369812, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469198848, + "loss": 0.6927, + "grad_norm": 1.058058261871338, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469234176, + "loss": 0.5205, + "grad_norm": 0.5982127785682678, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469258752, + "loss": 0.633, + "grad_norm": 1.1450179815292358, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469220352, + "loss": 0.5867, + "grad_norm": 0.9508165717124939, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469441536, + "loss": 0.5985, + "grad_norm": 1.0046089887619019, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469349376, + "loss": 0.6469, + "grad_norm": 1.5423786640167236, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46925568, + "loss": 0.5739, + "grad_norm": 0.42227521538734436, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469181952, + "loss": 0.5698, + "grad_norm": 0.4269810914993286, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469530624, + "loss": 0.5034, + "grad_norm": 0.4543478786945343, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46912512, + "loss": 0.6431, + "grad_norm": 0.7189933061599731, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46907136, + "loss": 0.52, + "grad_norm": 0.413745641708374, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46984704, + "loss": 0.5457, + "grad_norm": 0.4668547809123993, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469323264, + "loss": 0.6425, + "grad_norm": 0.9189396500587463, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469235712, + "loss": 0.6388, + "grad_norm": 0.7249500751495361, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469185024, + "loss": 0.588, + "grad_norm": 0.9980934262275696, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469280256, + "loss": 0.5594, + "grad_norm": 0.4786507189273834, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469197312, + "loss": 0.6214, + "grad_norm": 0.5040615200996399, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469215744, + "loss": 0.5981, + "grad_norm": 1.250231385231018, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469303296, + "loss": 0.6258, + "grad_norm": 1.619568109512329, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469188096, + "loss": 0.5679, + "grad_norm": 0.8477152585983276, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469251072, + "loss": 0.6117, + "grad_norm": 0.5000309944152832, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469158912, + "loss": 0.5838, + "grad_norm": 0.49062803387641907, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469458432, + "loss": 0.5706, + "grad_norm": 0.4446929097175598, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469181952, + "loss": 0.7163, + "grad_norm": 1.225297451019287, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46914816, + "loss": 0.5768, + "grad_norm": 0.7312919497489929, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.717530624, + "gpu_mem": 4.4692864, + "loss": 0.5084, + "grad_norm": 0.6148270964622498, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469384704, + "loss": 0.5796, + "grad_norm": 0.5432323217391968, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469131264, + "loss": 0.5521, + "grad_norm": 0.6355196833610535, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469231104, + "loss": 0.6336, + "grad_norm": 1.1527329683303833, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469203456, + "loss": 0.605, + "grad_norm": 0.5085532665252686, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46914048, + "loss": 0.5601, + "grad_norm": 0.5035235285758972, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469358592, + "loss": 0.6475, + "grad_norm": 1.6928083896636963, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46925568, + "loss": 0.6012, + "grad_norm": 0.5176418423652649, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469203456, + "loss": 0.518, + "grad_norm": 0.49645403027534485, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469181952, + "loss": 0.5492, + "grad_norm": 0.4606945216655731, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469191168, + "loss": 0.4818, + "grad_norm": 0.49132272601127625, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469123584, + "loss": 0.6454, + "grad_norm": 0.5960263609886169, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.717530624, + "gpu_mem": 4.4692864, + "loss": 0.5689, + "grad_norm": 0.9244797825813293, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46915584, + "loss": 0.6324, + "grad_norm": 0.5442416667938232, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469275648, + "loss": 0.5576, + "grad_norm": 0.8285912871360779, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.717530624, + "gpu_mem": 4.4690944, + "loss": 0.5214, + "grad_norm": 0.5802145004272461, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469226496, + "loss": 0.521, + "grad_norm": 1.586517333984375, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469200384, + "loss": 0.5584, + "grad_norm": 0.49083393812179565, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469166592, + "loss": 0.6541, + "grad_norm": 0.5622571706771851, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46927104, + "loss": 0.5181, + "grad_norm": 0.6026725769042969, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469266432, + "loss": 0.495, + "grad_norm": 0.4962829053401947, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46912512, + "loss": 0.5268, + "grad_norm": 1.0477226972579956, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46931712, + "loss": 0.5355, + "grad_norm": 0.6409386396408081, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469168128, + "loss": 0.5921, + "grad_norm": 1.1158617734909058, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46927104, + "loss": 0.5637, + "grad_norm": 0.7426369190216064, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469473792, + "loss": 0.5122, + "grad_norm": 0.5883936882019043, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469275648, + "loss": 0.5466, + "grad_norm": 0.5786541104316711, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469161984, + "loss": 0.4724, + "grad_norm": 0.7795109748840332, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469175808, + "loss": 0.5601, + "grad_norm": 0.651544988155365, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469220352, + "loss": 0.604, + "grad_norm": 1.1575654745101929, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469161984, + "loss": 0.5268, + "grad_norm": 0.6956334114074707, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469395456, + "loss": 0.591, + "grad_norm": 1.1958980560302734, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469426176, + "loss": 0.5389, + "grad_norm": 1.7963932752609253, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469352448, + "loss": 0.6071, + "grad_norm": 0.549054741859436, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46924032, + "loss": 0.5965, + "grad_norm": 0.8228573799133301, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469189632, + "loss": 0.4806, + "grad_norm": 1.143716812133789, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469158912, + "loss": 0.6204, + "grad_norm": 0.6550019979476929, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469181952, + "loss": 0.6674, + "grad_norm": 1.096329689025879, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469264896, + "loss": 0.523, + "grad_norm": 0.7323886156082153, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469192704, + "loss": 0.631, + "grad_norm": 1.788822889328003, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469358592, + "loss": 0.5515, + "grad_norm": 0.6211614012718201, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469200384, + "loss": 0.6035, + "grad_norm": 1.5593318939208984, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469177344, + "loss": 0.5387, + "grad_norm": 0.4714277684688568, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469303296, + "loss": 0.5465, + "grad_norm": 0.42789241671562195, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469337088, + "loss": 0.5841, + "grad_norm": 0.7923482060432434, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469204992, + "loss": 0.6254, + "grad_norm": 0.8893699645996094, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469341696, + "loss": 0.5427, + "grad_norm": 0.8434540033340454, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46925568, + "loss": 0.5344, + "grad_norm": 1.0874117612838745, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469220352, + "loss": 0.5543, + "grad_norm": 0.9588460326194763, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469185024, + "loss": 0.563, + "grad_norm": 0.508337140083313, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469334016, + "loss": 0.5055, + "grad_norm": 1.094592809677124, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469223424, + "loss": 0.5079, + "grad_norm": 0.5023008584976196, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469168128, + "loss": 0.6684, + "grad_norm": 0.6466962099075317, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46910976, + "loss": 0.5502, + "grad_norm": 0.5090475082397461, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469166592, + "loss": 0.5547, + "grad_norm": 0.7726592421531677, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469444608, + "loss": 0.5945, + "grad_norm": 0.711047887802124, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469168128, + "loss": 0.6187, + "grad_norm": 0.6217044591903687, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469481472, + "loss": 0.5314, + "grad_norm": 0.8925166726112366, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469357056, + "loss": 0.542, + "grad_norm": 1.0799064636230469, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469112832, + "loss": 0.4834, + "grad_norm": 1.0067927837371826, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469172736, + "loss": 0.5416, + "grad_norm": 0.6978006958961487, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469234176, + "loss": 0.5148, + "grad_norm": 1.0577863454818726, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469235712, + "loss": 0.5649, + "grad_norm": 0.8333036303520203, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469489152, + "loss": 0.633, + "grad_norm": 0.709492564201355, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469138944, + "loss": 0.6644, + "grad_norm": 0.5904974937438965, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469435392, + "loss": 0.537, + "grad_norm": 1.0093928575515747, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469297152, + "loss": 0.5652, + "grad_norm": 0.6189796924591064, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469149696, + "loss": 0.6832, + "grad_norm": 1.456748604774475, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469289472, + "loss": 0.5447, + "grad_norm": 0.6964653134346008, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469168128, + "loss": 0.5805, + "grad_norm": 0.524156928062439, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469261824, + "loss": 0.5623, + "grad_norm": 0.6811053156852722, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469280256, + "loss": 0.4912, + "grad_norm": 0.5917328596115112, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46923264, + "loss": 0.5409, + "grad_norm": 0.5480870008468628, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469138944, + "loss": 0.4981, + "grad_norm": 0.7619912624359131, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469231104, + "loss": 0.5462, + "grad_norm": 0.9875842928886414, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469143552, + "loss": 0.5437, + "grad_norm": 0.6850496530532837, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46918656, + "loss": 0.6245, + "grad_norm": 0.8352252244949341, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469380096, + "loss": 0.5991, + "grad_norm": 0.6352962851524353, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469175808, + "loss": 0.5582, + "grad_norm": 0.7635812163352966, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46933248, + "loss": 0.5995, + "grad_norm": 1.3215811252593994, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469158912, + "loss": 0.5729, + "grad_norm": 0.6398079991340637, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469572096, + "loss": 0.5549, + "grad_norm": 0.7520560026168823, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469231104, + "loss": 0.5985, + "grad_norm": 1.3216657638549805, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46914816, + "loss": 0.5233, + "grad_norm": 0.5885584950447083, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469264896, + "loss": 0.4612, + "grad_norm": 0.6336489319801331, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469220352, + "loss": 0.4273, + "grad_norm": 0.805219829082489, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469177344, + "loss": 0.5067, + "grad_norm": 0.622574508190155, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469214208, + "loss": 0.5284, + "grad_norm": 0.572135329246521, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46930176, + "loss": 0.4853, + "grad_norm": 0.6615546345710754, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469220352, + "loss": 0.5876, + "grad_norm": 0.8003931641578674, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469435392, + "loss": 0.5349, + "grad_norm": 0.6436038017272949, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469228032, + "loss": 0.5349, + "grad_norm": 1.228948712348938, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46923264, + "loss": 0.4895, + "grad_norm": 0.8741024732589722, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469243392, + "loss": 0.523, + "grad_norm": 1.1317956447601318, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469281792, + "loss": 0.5982, + "grad_norm": 0.8004896640777588, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469334016, + "loss": 0.5299, + "grad_norm": 0.750188410282135, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469192704, + "loss": 0.5667, + "grad_norm": 1.0007354021072388, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469072896, + "loss": 0.5951, + "grad_norm": 0.8461171388626099, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469300224, + "loss": 0.6029, + "grad_norm": 1.0854365825653076, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469544448, + "loss": 0.4763, + "grad_norm": 0.49423810839653015, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469204992, + "loss": 0.5754, + "grad_norm": 0.6848010420799255, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469151232, + "loss": 0.6111, + "grad_norm": 0.8534131050109863, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469314048, + "loss": 0.6056, + "grad_norm": 0.5377519130706787, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469254144, + "loss": 0.6437, + "grad_norm": 0.6584522724151611, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469234176, + "loss": 0.5605, + "grad_norm": 0.7455096244812012, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469169664, + "loss": 0.6059, + "grad_norm": 0.6636515855789185, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469598208, + "loss": 0.557, + "grad_norm": 1.2527459859848022, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469304832, + "loss": 0.5424, + "grad_norm": 0.595691978931427, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469160448, + "loss": 0.5162, + "grad_norm": 0.5123315453529358, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469214208, + "loss": 0.5689, + "grad_norm": 0.6927798986434937, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469632, + "loss": 0.5781, + "grad_norm": 0.8162227272987366, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.717530624, + "gpu_mem": 4.4694016, + "loss": 0.5809, + "grad_norm": 0.935172975063324, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46918656, + "loss": 0.5931, + "grad_norm": 0.7867767810821533, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.717530624, + "gpu_mem": 4.46927104, + "loss": 0.5657, + "grad_norm": 0.7583330869674683, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469195776, + "loss": 0.637, + "grad_norm": 1.2002619504928589, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469231104, + "loss": 0.658, + "grad_norm": 0.8522661924362183, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469314048, + "loss": 0.5665, + "grad_norm": 0.6073691844940186, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469231104, + "loss": 0.6521, + "grad_norm": 1.107248067855835, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469257216, + "loss": 0.6243, + "grad_norm": 1.0533913373947144, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.717530624, + "gpu_mem": 4.469257216, + "train_runtime": 4441.4323, + "train_samples_per_second": 4.245, + "train_steps_per_second": 0.066, + "total_flos": 0.0, + "train_loss": 0.9406823077980353 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a6595677543f23b42f06770761e8d2aa18b1163d --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ac3e3c98f4312c0b21bf2bb2f12c7c567809079a --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.2660824536944832 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..61d9a7829f93d34512b38dea4c2a736e01bde212 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 789096 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-hellaswag-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2", + "seed": 42, + "timestamp": "2025-09-02T08:56:47.119776" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..7451895095fea834d426e140d50a61da3b06ed59 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r2-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.698422784, + "gpu_mem": 4.421420544, + "loss": 3.4877, + "grad_norm": 7.044434070587158, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.699405824, + "gpu_mem": 4.427879424, + "loss": 3.6203, + "grad_norm": 7.003216743469238, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.700388864, + "gpu_mem": 4.427887104, + "loss": 3.4275, + "grad_norm": 6.939116477966309, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.701568512, + "gpu_mem": 4.427920896, + "loss": 3.6121, + "grad_norm": 6.761279582977295, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.702551552, + "gpu_mem": 4.427884032, + "loss": 3.5131, + "grad_norm": 6.848179817199707, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.703534592, + "gpu_mem": 4.427930112, + "loss": 3.6119, + "grad_norm": 7.240479946136475, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.704321024, + "gpu_mem": 4.427890176, + "loss": 3.5731, + "grad_norm": 6.581533908843994, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.705107456, + "gpu_mem": 4.427920896, + "loss": 3.3123, + "grad_norm": 6.913814067840576, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.705893888, + "gpu_mem": 4.427920896, + "loss": 3.2442, + "grad_norm": 6.568249225616455, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.70668032, + "gpu_mem": 4.427864064, + "loss": 3.0934, + "grad_norm": 6.3976311683654785, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.707466752, + "gpu_mem": 4.427884032, + "loss": 3.0751, + "grad_norm": 6.982696056365967, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.708253184, + "gpu_mem": 4.42788096, + "loss": 3.3317, + "grad_norm": 6.906455039978027, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.709039616, + "gpu_mem": 4.42787328, + "loss": 3.0821, + "grad_norm": 6.182567119598389, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.709826048, + "gpu_mem": 4.427899392, + "loss": 2.9653, + "grad_norm": 6.1485090255737305, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.71061248, + "gpu_mem": 4.427897856, + "loss": 2.7652, + "grad_norm": 5.7540998458862305, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.711202304, + "gpu_mem": 4.427890176, + "loss": 2.8683, + "grad_norm": 5.214643478393555, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.711988736, + "gpu_mem": 4.427890176, + "loss": 2.6283, + "grad_norm": 4.751415729522705, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.712775168, + "gpu_mem": 4.427890176, + "loss": 2.63, + "grad_norm": 4.277373313903809, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.713364992, + "gpu_mem": 4.427890176, + "loss": 2.6227, + "grad_norm": 4.2605438232421875, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.714151424, + "gpu_mem": 4.427864064, + "loss": 2.4993, + "grad_norm": 4.099986553192139, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.714741248, + "gpu_mem": 4.42788096, + "loss": 2.5213, + "grad_norm": 3.775770664215088, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.715331072, + "gpu_mem": 4.42788864, + "loss": 2.1875, + "grad_norm": 2.921668767929077, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.715920896, + "gpu_mem": 4.427902464, + "loss": 2.26, + "grad_norm": 3.452160358428955, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.71651072, + "gpu_mem": 4.427887104, + "loss": 2.3344, + "grad_norm": 3.5098063945770264, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.717100544, + "gpu_mem": 4.427874816, + "loss": 1.9686, + "grad_norm": 2.3574893474578857, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.717690368, + "gpu_mem": 4.42788096, + "loss": 1.8065, + "grad_norm": 2.0331528186798096, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.7184768, + "gpu_mem": 4.42788864, + "loss": 1.8343, + "grad_norm": 2.0203492641448975, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.718870016, + "gpu_mem": 4.427884032, + "loss": 1.7628, + "grad_norm": 1.671234369277954, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.71945984, + "gpu_mem": 4.427893248, + "loss": 1.747, + "grad_norm": 2.0101375579833984, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.720049664, + "gpu_mem": 4.4278656, + "loss": 1.7503, + "grad_norm": 1.6616005897521973, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.720639488, + "gpu_mem": 4.427920896, + "loss": 1.6196, + "grad_norm": 1.4280704259872437, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.721229312, + "gpu_mem": 4.427913216, + "loss": 1.6614, + "grad_norm": 1.6360634565353394, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.721819136, + "gpu_mem": 4.427867136, + "loss": 1.5401, + "grad_norm": 1.1228793859481812, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.72240896, + "gpu_mem": 4.427885568, + "loss": 1.5155, + "grad_norm": 0.9862116575241089, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.722998784, + "gpu_mem": 4.427907072, + "loss": 1.4436, + "grad_norm": 0.7831733226776123, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.723588608, + "gpu_mem": 4.427905536, + "loss": 1.4417, + "grad_norm": 0.603388786315918, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.724178432, + "gpu_mem": 4.427937792, + "loss": 1.4537, + "grad_norm": 0.5592174530029297, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.724768256, + "gpu_mem": 4.427890176, + "loss": 1.444, + "grad_norm": 0.5786752700805664, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.72535808, + "gpu_mem": 4.427947008, + "loss": 1.4539, + "grad_norm": 1.1525065898895264, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.725751296, + "gpu_mem": 4.427874816, + "loss": 1.4664, + "grad_norm": 0.6327864527702332, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.726144512, + "gpu_mem": 4.427902464, + "loss": 1.4775, + "grad_norm": 0.7490549683570862, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.726734336, + "gpu_mem": 4.427916288, + "loss": 1.4139, + "grad_norm": 0.5082897543907166, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.72732416, + "gpu_mem": 4.427922432, + "loss": 1.4212, + "grad_norm": 0.5636128783226013, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.727913984, + "gpu_mem": 4.427900928, + "loss": 1.4198, + "grad_norm": 0.5438753366470337, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.7283072, + "gpu_mem": 4.427900928, + "loss": 1.4327, + "grad_norm": 0.5149738788604736, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.728897024, + "gpu_mem": 4.427900928, + "loss": 1.4275, + "grad_norm": 0.7032666206359863, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.729486848, + "gpu_mem": 4.427887104, + "loss": 1.3907, + "grad_norm": 0.3524303436279297, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.730076672, + "gpu_mem": 4.427905536, + "loss": 1.4196, + "grad_norm": 0.6558328866958618, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.730469888, + "gpu_mem": 4.427917824, + "loss": 1.3921, + "grad_norm": 0.29690393805503845, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.731059712, + "gpu_mem": 4.427894784, + "loss": 1.3822, + "grad_norm": 0.31508514285087585, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.731649536, + "gpu_mem": 4.427879424, + "loss": 1.3963, + "grad_norm": 0.569665789604187, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.732042752, + "gpu_mem": 4.427884032, + "loss": 1.3673, + "grad_norm": 0.34152036905288696, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.732435968, + "gpu_mem": 4.42791168, + "loss": 1.4171, + "grad_norm": 0.5739138126373291, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.732829184, + "gpu_mem": 4.427887104, + "loss": 1.4376, + "grad_norm": 0.7961390614509583, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.7332224, + "gpu_mem": 4.427905536, + "loss": 1.4182, + "grad_norm": 0.47439131140708923, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.733812224, + "gpu_mem": 4.427899392, + "loss": 1.4264, + "grad_norm": 0.6289687156677246, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.734402048, + "gpu_mem": 4.4278656, + "loss": 1.4097, + "grad_norm": 0.45058491826057434, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.734795264, + "gpu_mem": 4.427894784, + "loss": 1.4054, + "grad_norm": 0.4353361427783966, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.735385088, + "gpu_mem": 4.427877888, + "loss": 1.383, + "grad_norm": 0.5815163850784302, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.735974912, + "gpu_mem": 4.42791936, + "loss": 1.4044, + "grad_norm": 0.7179692387580872, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.736368128, + "gpu_mem": 4.427885568, + "loss": 1.3947, + "grad_norm": 0.28731754422187805, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.736761344, + "gpu_mem": 4.427925504, + "loss": 1.3895, + "grad_norm": 0.7793278694152832, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.73715456, + "gpu_mem": 4.427879424, + "loss": 1.4326, + "grad_norm": 0.3396264314651489, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.737547776, + "gpu_mem": 4.427884032, + "loss": 1.4258, + "grad_norm": 0.5113117098808289, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.7381376, + "gpu_mem": 4.42788096, + "loss": 1.4275, + "grad_norm": 0.46661895513534546, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.738530816, + "gpu_mem": 4.427899392, + "loss": 1.4496, + "grad_norm": 0.3025074899196625, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.738924032, + "gpu_mem": 4.427891712, + "loss": 1.4013, + "grad_norm": 0.32287317514419556, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.739513856, + "gpu_mem": 4.427876352, + "loss": 1.3744, + "grad_norm": 0.24611066281795502, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.739907072, + "gpu_mem": 4.427947008, + "loss": 1.416, + "grad_norm": 0.28950801491737366, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.740300288, + "gpu_mem": 4.427897856, + "loss": 1.435, + "grad_norm": 0.5663497447967529, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.740693504, + "gpu_mem": 4.427922432, + "loss": 1.438, + "grad_norm": 0.563672661781311, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.741283328, + "gpu_mem": 4.427893248, + "loss": 1.3647, + "grad_norm": 0.24021561443805695, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.741676544, + "gpu_mem": 4.427885568, + "loss": 1.4227, + "grad_norm": 0.5606208443641663, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.742266368, + "gpu_mem": 4.427879424, + "loss": 1.4015, + "grad_norm": 0.28034263849258423, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.742659584, + "gpu_mem": 4.427908608, + "loss": 1.3942, + "grad_norm": 0.2765113413333893, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.7430528, + "gpu_mem": 4.427899392, + "loss": 1.3917, + "grad_norm": 0.405184268951416, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.743249408, + "gpu_mem": 4.427887104, + "loss": 1.4138, + "grad_norm": 0.301628053188324, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.743839232, + "gpu_mem": 4.427879424, + "loss": 1.3619, + "grad_norm": 0.3369763493537903, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.744232448, + "gpu_mem": 4.427931648, + "loss": 1.3932, + "grad_norm": 0.3128480613231659, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.744625664, + "gpu_mem": 4.427910144, + "loss": 1.4036, + "grad_norm": 0.2513543665409088, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.74501888, + "gpu_mem": 4.427904, + "loss": 1.3682, + "grad_norm": 0.3001977801322937, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.745608704, + "gpu_mem": 4.42788096, + "loss": 1.3912, + "grad_norm": 0.3435753583908081, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.74600192, + "gpu_mem": 4.427902464, + "loss": 1.4179, + "grad_norm": 0.6141883134841919, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.746395136, + "gpu_mem": 4.427874816, + "loss": 1.4385, + "grad_norm": 0.432098925113678, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.746591744, + "gpu_mem": 4.427882496, + "loss": 1.4153, + "grad_norm": 0.4050647020339966, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.747181568, + "gpu_mem": 4.427900928, + "loss": 1.3807, + "grad_norm": 0.30926942825317383, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.747574784, + "gpu_mem": 4.427890176, + "loss": 1.4191, + "grad_norm": 0.4734255075454712, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.747771392, + "gpu_mem": 4.42788864, + "loss": 1.4213, + "grad_norm": 0.5042356252670288, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.748164608, + "gpu_mem": 4.427884032, + "loss": 1.3673, + "grad_norm": 0.27532869577407837, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.748557824, + "gpu_mem": 4.42788864, + "loss": 1.3834, + "grad_norm": 0.33396437764167786, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.74895104, + "gpu_mem": 4.427899392, + "loss": 1.3815, + "grad_norm": 0.27740201354026794, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.749344256, + "gpu_mem": 4.427902464, + "loss": 1.4037, + "grad_norm": 0.3106025755405426, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.749737472, + "gpu_mem": 4.427902464, + "loss": 1.4244, + "grad_norm": 0.4056639075279236, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.750130688, + "gpu_mem": 4.427897856, + "loss": 1.4106, + "grad_norm": 0.3912266790866852, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.750523904, + "gpu_mem": 4.427916288, + "loss": 1.3763, + "grad_norm": 0.40159371495246887, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.75091712, + "gpu_mem": 4.42791936, + "loss": 1.402, + "grad_norm": 0.2465643286705017, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.751310336, + "gpu_mem": 4.42789632, + "loss": 1.398, + "grad_norm": 0.25606217980384827, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.751703552, + "gpu_mem": 4.427907072, + "loss": 1.4014, + "grad_norm": 0.20486007630825043, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.752096768, + "gpu_mem": 4.427907072, + "loss": 1.3953, + "grad_norm": 0.49437621235847473, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.752489984, + "gpu_mem": 4.427882496, + "loss": 1.4015, + "grad_norm": 0.2777295410633087, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.7528832, + "gpu_mem": 4.42791168, + "loss": 1.4045, + "grad_norm": 0.43401452898979187, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.753079808, + "gpu_mem": 4.42788864, + "loss": 1.3693, + "grad_norm": 0.3714810907840729, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.753473024, + "gpu_mem": 4.427905536, + "loss": 1.3954, + "grad_norm": 0.3342154920101166, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.753669632, + "gpu_mem": 4.42787328, + "loss": 1.3865, + "grad_norm": 0.34517109394073486, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.754062848, + "gpu_mem": 4.42788864, + "loss": 1.3965, + "grad_norm": 0.31532129645347595, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.754456064, + "gpu_mem": 4.427868672, + "loss": 1.404, + "grad_norm": 0.3387033939361572, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.75484928, + "gpu_mem": 4.427910144, + "loss": 1.3741, + "grad_norm": 0.2058364599943161, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.755242496, + "gpu_mem": 4.427905536, + "loss": 1.3983, + "grad_norm": 0.2514621615409851, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.755439104, + "gpu_mem": 4.42791168, + "loss": 1.3747, + "grad_norm": 0.3155597746372223, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.755635712, + "gpu_mem": 4.427908608, + "loss": 1.3759, + "grad_norm": 0.23293180763721466, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.756028928, + "gpu_mem": 4.427910144, + "loss": 1.3751, + "grad_norm": 0.22945232689380646, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.756422144, + "gpu_mem": 4.427907072, + "loss": 1.3943, + "grad_norm": 0.27216753363609314, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.75681536, + "gpu_mem": 4.427887104, + "loss": 1.4073, + "grad_norm": 0.23347613215446472, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.757208576, + "gpu_mem": 4.427882496, + "loss": 1.4003, + "grad_norm": 0.4743565022945404, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.757405184, + "gpu_mem": 4.427900928, + "loss": 1.401, + "grad_norm": 0.16435742378234863, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.7577984, + "gpu_mem": 4.42791168, + "loss": 1.4014, + "grad_norm": 0.2586367726325989, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.758191616, + "gpu_mem": 4.427897856, + "loss": 1.3843, + "grad_norm": 0.2313457876443863, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.758388224, + "gpu_mem": 4.427913216, + "loss": 1.4026, + "grad_norm": 0.4052332043647766, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.75878144, + "gpu_mem": 4.427894784, + "loss": 1.3761, + "grad_norm": 0.4530315101146698, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.759174656, + "gpu_mem": 4.427920896, + "loss": 1.4041, + "grad_norm": 0.38362884521484375, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.759371264, + "gpu_mem": 4.427879424, + "loss": 1.4061, + "grad_norm": 0.3751177191734314, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.75976448, + "gpu_mem": 4.42791168, + "loss": 1.3879, + "grad_norm": 0.29739537835121155, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.760157696, + "gpu_mem": 4.427905536, + "loss": 1.387, + "grad_norm": 0.2242182344198227, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.760550912, + "gpu_mem": 4.427907072, + "loss": 1.3827, + "grad_norm": 0.305651992559433, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.76074752, + "gpu_mem": 4.427882496, + "loss": 1.3602, + "grad_norm": 0.28008952736854553, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.760944128, + "gpu_mem": 4.427891712, + "loss": 1.3627, + "grad_norm": 0.2981486916542053, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.761337344, + "gpu_mem": 4.427877888, + "loss": 1.4143, + "grad_norm": 0.3320181369781494, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.76173056, + "gpu_mem": 4.427914752, + "loss": 1.3756, + "grad_norm": 0.24858158826828003, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.762123776, + "gpu_mem": 4.42791168, + "loss": 1.3955, + "grad_norm": 0.23311658203601837, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.762320384, + "gpu_mem": 4.42791168, + "loss": 1.4194, + "grad_norm": 0.32549288868904114, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.7627136, + "gpu_mem": 4.427900928, + "loss": 1.4164, + "grad_norm": 0.3670238256454468, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.763106816, + "gpu_mem": 4.427900928, + "loss": 1.3472, + "grad_norm": 0.29794058203697205, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.763303424, + "gpu_mem": 4.427882496, + "loss": 1.3917, + "grad_norm": 0.3001929819583893, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.76369664, + "gpu_mem": 4.427893248, + "loss": 1.4125, + "grad_norm": 0.38998860120773315, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.763893248, + "gpu_mem": 4.427902464, + "loss": 1.3919, + "grad_norm": 0.30731478333473206, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.764089856, + "gpu_mem": 4.427917824, + "loss": 1.411, + "grad_norm": 0.3345436453819275, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.764483072, + "gpu_mem": 4.4278656, + "loss": 1.3792, + "grad_norm": 0.18260601162910461, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.764876288, + "gpu_mem": 4.427885568, + "loss": 1.4068, + "grad_norm": 0.3405553102493286, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.765072896, + "gpu_mem": 4.427867136, + "loss": 1.3971, + "grad_norm": 0.41323742270469666, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.765269504, + "gpu_mem": 4.427884032, + "loss": 1.382, + "grad_norm": 0.2742190361022949, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.765466112, + "gpu_mem": 4.427890176, + "loss": 1.3837, + "grad_norm": 0.31916341185569763, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.765859328, + "gpu_mem": 4.427887104, + "loss": 1.3808, + "grad_norm": 0.20120978355407715, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.766252544, + "gpu_mem": 4.427913216, + "loss": 1.3916, + "grad_norm": 0.23243924975395203, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.766449152, + "gpu_mem": 4.427887104, + "loss": 1.3915, + "grad_norm": 0.178579181432724, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.76664576, + "gpu_mem": 4.42792704, + "loss": 1.3895, + "grad_norm": 0.3732435405254364, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.767038976, + "gpu_mem": 4.427876352, + "loss": 1.3876, + "grad_norm": 0.23378752171993256, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.767432192, + "gpu_mem": 4.427885568, + "loss": 1.4173, + "grad_norm": 0.26765573024749756, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.7676288, + "gpu_mem": 4.427905536, + "loss": 1.3855, + "grad_norm": 0.31228381395339966, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.767825408, + "gpu_mem": 4.42789632, + "loss": 1.3853, + "grad_norm": 0.2812199890613556, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.768218624, + "gpu_mem": 4.427908608, + "loss": 1.3703, + "grad_norm": 0.2634398639202118, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.768415232, + "gpu_mem": 4.42787328, + "loss": 1.3812, + "grad_norm": 0.3885889947414398, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.76861184, + "gpu_mem": 4.427904, + "loss": 1.389, + "grad_norm": 0.40520215034484863, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.769005056, + "gpu_mem": 4.427899392, + "loss": 1.3844, + "grad_norm": 0.32309800386428833, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.769201664, + "gpu_mem": 4.427923968, + "loss": 1.3921, + "grad_norm": 0.266435831785202, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.769398272, + "gpu_mem": 4.427860992, + "loss": 1.389, + "grad_norm": 0.25740817189216614, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.769791488, + "gpu_mem": 4.427914752, + "loss": 1.3709, + "grad_norm": 0.24679309129714966, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.769988096, + "gpu_mem": 4.427907072, + "loss": 1.3915, + "grad_norm": 0.2954455316066742, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.770184704, + "gpu_mem": 4.427922432, + "loss": 1.4045, + "grad_norm": 0.26837798953056335, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.77057792, + "gpu_mem": 4.427923968, + "loss": 1.3957, + "grad_norm": 0.23529639840126038, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.770774528, + "gpu_mem": 4.427891712, + "loss": 1.428, + "grad_norm": 0.3924208879470825, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.771167744, + "gpu_mem": 4.427884032, + "loss": 1.4008, + "grad_norm": 0.413627952337265, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.771364352, + "gpu_mem": 4.42791936, + "loss": 1.3944, + "grad_norm": 0.2782786190509796, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.77156096, + "gpu_mem": 4.427867136, + "loss": 1.4076, + "grad_norm": 0.3563677668571472, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.771757568, + "gpu_mem": 4.427902464, + "loss": 1.3788, + "grad_norm": 0.354828804731369, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.771954176, + "gpu_mem": 4.427891712, + "loss": 1.3693, + "grad_norm": 0.21159392595291138, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.772150784, + "gpu_mem": 4.427923968, + "loss": 1.3888, + "grad_norm": 0.30549514293670654, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.772347392, + "gpu_mem": 4.427917824, + "loss": 1.3713, + "grad_norm": 0.381334513425827, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.772740608, + "gpu_mem": 4.42789632, + "loss": 1.3799, + "grad_norm": 0.18960174918174744, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.773133824, + "gpu_mem": 4.427893248, + "loss": 1.3825, + "grad_norm": 0.35098424553871155, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.773330432, + "gpu_mem": 4.427930112, + "loss": 1.408, + "grad_norm": 0.2747291326522827, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.77352704, + "gpu_mem": 4.427904, + "loss": 1.3643, + "grad_norm": 0.30499595403671265, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.773723648, + "gpu_mem": 4.42788096, + "loss": 1.385, + "grad_norm": 0.25835368037223816, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.773920256, + "gpu_mem": 4.427907072, + "loss": 1.4193, + "grad_norm": 0.3409912586212158, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.774116864, + "gpu_mem": 4.42791936, + "loss": 1.4167, + "grad_norm": 0.34257370233535767, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.774313472, + "gpu_mem": 4.42788096, + "loss": 1.3957, + "grad_norm": 0.31916728615760803, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.774706688, + "gpu_mem": 4.427890176, + "loss": 1.4061, + "grad_norm": 0.2695143520832062, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.774903296, + "gpu_mem": 4.427871744, + "loss": 1.3892, + "grad_norm": 0.24435995519161224, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.775099904, + "gpu_mem": 4.427885568, + "loss": 1.3994, + "grad_norm": 0.42188647389411926, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.775296512, + "gpu_mem": 4.427893248, + "loss": 1.3985, + "grad_norm": 0.26247352361679077, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.77549312, + "gpu_mem": 4.427874816, + "loss": 1.4003, + "grad_norm": 0.3982261121273041, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.775689728, + "gpu_mem": 4.427905536, + "loss": 1.3846, + "grad_norm": 0.28454282879829407, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.775886336, + "gpu_mem": 4.427876352, + "loss": 1.3955, + "grad_norm": 0.45679807662963867, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.776082944, + "gpu_mem": 4.427900928, + "loss": 1.3753, + "grad_norm": 0.23870326578617096, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.77647616, + "gpu_mem": 4.42788096, + "loss": 1.3886, + "grad_norm": 0.3374330699443817, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.776869376, + "gpu_mem": 4.427913216, + "loss": 1.4043, + "grad_norm": 0.47399598360061646, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.777065984, + "gpu_mem": 4.427904, + "loss": 1.3862, + "grad_norm": 0.2820194363594055, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.777262592, + "gpu_mem": 4.427899392, + "loss": 1.4086, + "grad_norm": 0.4791780412197113, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.7774592, + "gpu_mem": 4.427856384, + "loss": 1.366, + "grad_norm": 0.2521732747554779, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.777655808, + "gpu_mem": 4.427936256, + "loss": 1.3719, + "grad_norm": 0.1489843726158142, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.777852416, + "gpu_mem": 4.427887104, + "loss": 1.3802, + "grad_norm": 0.31168580055236816, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.778049024, + "gpu_mem": 4.427887104, + "loss": 1.3651, + "grad_norm": 0.24028924107551575, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.778245632, + "gpu_mem": 4.427853312, + "loss": 1.4346, + "grad_norm": 0.542298436164856, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.77844224, + "gpu_mem": 4.427893248, + "loss": 1.3822, + "grad_norm": 0.37611863017082214, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.778638848, + "gpu_mem": 4.42788864, + "loss": 1.3754, + "grad_norm": 0.3566122353076935, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.778835456, + "gpu_mem": 4.427876352, + "loss": 1.402, + "grad_norm": 0.29868829250335693, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.779032064, + "gpu_mem": 4.427900928, + "loss": 1.4237, + "grad_norm": 0.5709932446479797, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.779228672, + "gpu_mem": 4.427916288, + "loss": 1.404, + "grad_norm": 0.49137893319129944, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.77942528, + "gpu_mem": 4.42788864, + "loss": 1.4023, + "grad_norm": 0.31108132004737854, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.779621888, + "gpu_mem": 4.427885568, + "loss": 1.3925, + "grad_norm": 0.254374235868454, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.779818496, + "gpu_mem": 4.427900928, + "loss": 1.4227, + "grad_norm": 0.2750442922115326, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.780015104, + "gpu_mem": 4.427877888, + "loss": 1.3954, + "grad_norm": 0.16411855816841125, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.780211712, + "gpu_mem": 4.427879424, + "loss": 1.3917, + "grad_norm": 0.2034159153699875, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.780211712, + "gpu_mem": 4.427920896, + "loss": 1.3911, + "grad_norm": 0.3298017680644989, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.780604928, + "gpu_mem": 4.427891712, + "loss": 1.3864, + "grad_norm": 0.24234017729759216, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.780801536, + "gpu_mem": 4.427891712, + "loss": 1.3788, + "grad_norm": 0.23847289383411407, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.780998144, + "gpu_mem": 4.42788864, + "loss": 1.3952, + "grad_norm": 0.36404067277908325, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.781194752, + "gpu_mem": 4.42788864, + "loss": 1.4025, + "grad_norm": 0.2496383786201477, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.781587968, + "gpu_mem": 4.427879424, + "loss": 1.3963, + "grad_norm": 0.24600175023078918, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.781587968, + "gpu_mem": 4.427914752, + "loss": 1.3719, + "grad_norm": 0.26217636466026306, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.781784576, + "gpu_mem": 4.427871744, + "loss": 1.3794, + "grad_norm": 0.23115678131580353, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.781981184, + "gpu_mem": 4.427899392, + "loss": 1.3867, + "grad_norm": 0.21077190339565277, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.782177792, + "gpu_mem": 4.427908608, + "loss": 1.3812, + "grad_norm": 0.4433186650276184, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.7823744, + "gpu_mem": 4.42788096, + "loss": 1.3644, + "grad_norm": 0.27155545353889465, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.782571008, + "gpu_mem": 4.427890176, + "loss": 1.3944, + "grad_norm": 0.33626899123191833, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.782767616, + "gpu_mem": 4.427891712, + "loss": 1.3917, + "grad_norm": 0.3349422216415405, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.783160832, + "gpu_mem": 4.427891712, + "loss": 1.395, + "grad_norm": 0.3613399267196655, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.78335744, + "gpu_mem": 4.427876352, + "loss": 1.4019, + "grad_norm": 0.20067688822746277, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.783554048, + "gpu_mem": 4.427897856, + "loss": 1.3912, + "grad_norm": 0.26964035630226135, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.783750656, + "gpu_mem": 4.427931648, + "loss": 1.3856, + "grad_norm": 0.2510623335838318, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.783947264, + "gpu_mem": 4.427885568, + "loss": 1.4074, + "grad_norm": 0.3087608516216278, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.784143872, + "gpu_mem": 4.427891712, + "loss": 1.3901, + "grad_norm": 0.2555963695049286, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.78434048, + "gpu_mem": 4.427907072, + "loss": 1.3995, + "grad_norm": 0.23474067449569702, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.784537088, + "gpu_mem": 4.427925504, + "loss": 1.3921, + "grad_norm": 0.2689337432384491, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.784537088, + "gpu_mem": 4.427894784, + "loss": 1.3836, + "grad_norm": 0.16399122774600983, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.784733696, + "gpu_mem": 4.42788096, + "loss": 1.371, + "grad_norm": 0.20231993496418, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.784930304, + "gpu_mem": 4.42787328, + "loss": 1.3872, + "grad_norm": 0.160404235124588, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.785126912, + "gpu_mem": 4.427937792, + "loss": 1.3763, + "grad_norm": 0.2027992308139801, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.78532352, + "gpu_mem": 4.427876352, + "loss": 1.3973, + "grad_norm": 0.4589766561985016, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.785520128, + "gpu_mem": 4.427928576, + "loss": 1.3835, + "grad_norm": 0.1606283038854599, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.785716736, + "gpu_mem": 4.427910144, + "loss": 1.3816, + "grad_norm": 0.20518217980861664, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.785913344, + "gpu_mem": 4.427908608, + "loss": 1.3804, + "grad_norm": 0.28026100993156433, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.786109952, + "gpu_mem": 4.427913216, + "loss": 1.382, + "grad_norm": 0.16454854607582092, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.786109952, + "gpu_mem": 4.42788864, + "loss": 1.3882, + "grad_norm": 0.5837136507034302, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.78630656, + "gpu_mem": 4.427917824, + "loss": 1.4037, + "grad_norm": 0.2740838825702667, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.786503168, + "gpu_mem": 4.427894784, + "loss": 1.3918, + "grad_norm": 0.3473643362522125, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.786699776, + "gpu_mem": 4.427956224, + "loss": 1.3871, + "grad_norm": 0.3828486204147339, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.786699776, + "gpu_mem": 4.42788096, + "loss": 1.4006, + "grad_norm": 0.3704400658607483, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.786896384, + "gpu_mem": 4.427891712, + "loss": 1.4133, + "grad_norm": 0.2892366349697113, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.787092992, + "gpu_mem": 4.427890176, + "loss": 1.4055, + "grad_norm": 0.21304170787334442, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.7872896, + "gpu_mem": 4.427887104, + "loss": 1.3724, + "grad_norm": 0.20311444997787476, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.787486208, + "gpu_mem": 4.427917824, + "loss": 1.3711, + "grad_norm": 0.2588862180709839, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.787682816, + "gpu_mem": 4.42789632, + "loss": 1.3834, + "grad_norm": 0.2649410367012024, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.787879424, + "gpu_mem": 4.427891712, + "loss": 1.3859, + "grad_norm": 0.22445179522037506, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.788076032, + "gpu_mem": 4.427902464, + "loss": 1.385, + "grad_norm": 0.40457040071487427, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.78827264, + "gpu_mem": 4.427907072, + "loss": 1.4012, + "grad_norm": 0.20223093032836914, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.788469248, + "gpu_mem": 4.427868672, + "loss": 1.3885, + "grad_norm": 0.18710444867610931, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.788665856, + "gpu_mem": 4.427936256, + "loss": 1.373, + "grad_norm": 0.2182277888059616, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.788665856, + "gpu_mem": 4.427899392, + "loss": 1.3725, + "grad_norm": 0.2739138901233673, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.788862464, + "gpu_mem": 4.42788864, + "loss": 1.4049, + "grad_norm": 0.27758654952049255, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.789059072, + "gpu_mem": 4.427905536, + "loss": 1.3756, + "grad_norm": 0.279474675655365, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.789059072, + "gpu_mem": 4.427879424, + "loss": 1.4004, + "grad_norm": 0.4316874146461487, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.78925568, + "gpu_mem": 4.42792704, + "loss": 1.4137, + "grad_norm": 0.2838718295097351, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.789452288, + "gpu_mem": 4.427894784, + "loss": 1.3814, + "grad_norm": 0.3089337646961212, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.789648896, + "gpu_mem": 4.427884032, + "loss": 1.3741, + "grad_norm": 0.26441892981529236, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.789845504, + "gpu_mem": 4.427899392, + "loss": 1.3808, + "grad_norm": 0.23282578587532043, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.790042112, + "gpu_mem": 4.42789632, + "loss": 1.3786, + "grad_norm": 0.22620214521884918, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.79023872, + "gpu_mem": 4.42789632, + "loss": 1.3936, + "grad_norm": 0.38987070322036743, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.79023872, + "gpu_mem": 4.427884032, + "loss": 1.3716, + "grad_norm": 0.24752965569496155, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.790435328, + "gpu_mem": 4.427867136, + "loss": 1.3921, + "grad_norm": 0.29713204503059387, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.790435328, + "gpu_mem": 4.427930112, + "loss": 1.3976, + "grad_norm": 0.2650630474090576, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.790631936, + "gpu_mem": 4.427884032, + "loss": 1.3993, + "grad_norm": 0.26719552278518677, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.790828544, + "gpu_mem": 4.427893248, + "loss": 1.398, + "grad_norm": 0.42754459381103516, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.791025152, + "gpu_mem": 4.427928576, + "loss": 1.4001, + "grad_norm": 0.23662503063678741, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.79122176, + "gpu_mem": 4.427893248, + "loss": 1.3854, + "grad_norm": 0.22174762189388275, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.791418368, + "gpu_mem": 4.427897856, + "loss": 1.3882, + "grad_norm": 0.24217994511127472, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.791418368, + "gpu_mem": 4.427945472, + "loss": 1.3848, + "grad_norm": 0.20656748116016388, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.791614976, + "gpu_mem": 4.427954688, + "loss": 1.3908, + "grad_norm": 0.21850359439849854, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.791811584, + "gpu_mem": 4.427908608, + "loss": 1.3938, + "grad_norm": 0.33277738094329834, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.792008192, + "gpu_mem": 4.427902464, + "loss": 1.4285, + "grad_norm": 0.5957878232002258, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.792008192, + "gpu_mem": 4.427963904, + "loss": 1.3952, + "grad_norm": 0.3576374351978302, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.7922048, + "gpu_mem": 4.427890176, + "loss": 1.3906, + "grad_norm": 0.2239820808172226, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.7922048, + "gpu_mem": 4.42788864, + "loss": 1.3807, + "grad_norm": 0.1930428296327591, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.792401408, + "gpu_mem": 4.427891712, + "loss": 1.3784, + "grad_norm": 0.18813225626945496, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.792598016, + "gpu_mem": 4.427877888, + "loss": 1.4095, + "grad_norm": 0.30676618218421936, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.792794624, + "gpu_mem": 4.427893248, + "loss": 1.4017, + "grad_norm": 0.2345120757818222, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.792794624, + "gpu_mem": 4.427931648, + "loss": 1.3832, + "grad_norm": 0.1874900907278061, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.792794624, + "gpu_mem": 4.42791168, + "loss": 1.3717, + "grad_norm": 0.2030116617679596, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.792991232, + "gpu_mem": 4.427937792, + "loss": 1.4042, + "grad_norm": 0.2831662595272064, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.79318784, + "gpu_mem": 4.42788864, + "loss": 1.377, + "grad_norm": 0.256205677986145, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.793384448, + "gpu_mem": 4.427882496, + "loss": 1.4071, + "grad_norm": 0.31681525707244873, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.793581056, + "gpu_mem": 4.427905536, + "loss": 1.3764, + "grad_norm": 0.23493638634681702, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.793581056, + "gpu_mem": 4.427884032, + "loss": 1.3635, + "grad_norm": 0.2256968766450882, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.793777664, + "gpu_mem": 4.427897856, + "loss": 1.3808, + "grad_norm": 0.22907942533493042, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.793974272, + "gpu_mem": 4.427902464, + "loss": 1.3688, + "grad_norm": 0.3242001235485077, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.79417088, + "gpu_mem": 4.427920896, + "loss": 1.412, + "grad_norm": 0.44510719180107117, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.79417088, + "gpu_mem": 4.427891712, + "loss": 1.3571, + "grad_norm": 0.2116863876581192, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.79417088, + "gpu_mem": 4.42791936, + "loss": 1.4039, + "grad_norm": 0.19444353878498077, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.794367488, + "gpu_mem": 4.427900928, + "loss": 1.3933, + "grad_norm": 0.36611372232437134, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.794564096, + "gpu_mem": 4.42788864, + "loss": 1.3976, + "grad_norm": 0.30852165818214417, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.794564096, + "gpu_mem": 4.427897856, + "loss": 1.3732, + "grad_norm": 0.21905942261219025, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.794760704, + "gpu_mem": 4.427894784, + "loss": 1.3804, + "grad_norm": 0.25244370102882385, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.794760704, + "gpu_mem": 4.427910144, + "loss": 1.4073, + "grad_norm": 0.3722356855869293, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.794957312, + "gpu_mem": 4.427917824, + "loss": 1.3788, + "grad_norm": 0.22621676325798035, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.79515392, + "gpu_mem": 4.427907072, + "loss": 1.4028, + "grad_norm": 0.2161528319120407, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.79515392, + "gpu_mem": 4.427891712, + "loss": 1.3921, + "grad_norm": 0.30365151166915894, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.79515392, + "gpu_mem": 4.427894784, + "loss": 1.3794, + "grad_norm": 0.17406271398067474, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.795350528, + "gpu_mem": 4.42788864, + "loss": 1.3765, + "grad_norm": 0.25168919563293457, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.795547136, + "gpu_mem": 4.427884032, + "loss": 1.3923, + "grad_norm": 0.39831075072288513, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.795743744, + "gpu_mem": 4.427905536, + "loss": 1.3859, + "grad_norm": 0.19834551215171814, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.795743744, + "gpu_mem": 4.427897856, + "loss": 1.3658, + "grad_norm": 0.2904741168022156, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.795743744, + "gpu_mem": 4.427870208, + "loss": 1.3773, + "grad_norm": 0.16536711156368256, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.795940352, + "gpu_mem": 4.427868672, + "loss": 1.3803, + "grad_norm": 0.18934541940689087, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.795940352, + "gpu_mem": 4.427894784, + "loss": 1.4084, + "grad_norm": 0.2567669749259949, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.79613696, + "gpu_mem": 4.427877888, + "loss": 1.3785, + "grad_norm": 0.21445639431476593, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.796333568, + "gpu_mem": 4.427908608, + "loss": 1.3567, + "grad_norm": 0.6063414216041565, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.796333568, + "gpu_mem": 4.427891712, + "loss": 1.4042, + "grad_norm": 0.2814069986343384, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.796333568, + "gpu_mem": 4.427922432, + "loss": 1.3896, + "grad_norm": 0.32292866706848145, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.796530176, + "gpu_mem": 4.427890176, + "loss": 1.4043, + "grad_norm": 0.23387503623962402, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.796726784, + "gpu_mem": 4.427916288, + "loss": 1.3645, + "grad_norm": 0.25090140104293823, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.796726784, + "gpu_mem": 4.427891712, + "loss": 1.3821, + "grad_norm": 0.14247769117355347, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.796923392, + "gpu_mem": 4.427887104, + "loss": 1.3948, + "grad_norm": 0.201180100440979, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.796923392, + "gpu_mem": 4.427890176, + "loss": 1.3876, + "grad_norm": 0.2883966863155365, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.79712, + "gpu_mem": 4.427908608, + "loss": 1.4005, + "grad_norm": 0.30925244092941284, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.79712, + "gpu_mem": 4.42788864, + "loss": 1.3656, + "grad_norm": 0.2836361825466156, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.797316608, + "gpu_mem": 4.427893248, + "loss": 1.381, + "grad_norm": 0.15359069406986237, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.797316608, + "gpu_mem": 4.42788864, + "loss": 1.4159, + "grad_norm": 0.2947242259979248, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.797513216, + "gpu_mem": 4.42789632, + "loss": 1.3962, + "grad_norm": 0.17640447616577148, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.797513216, + "gpu_mem": 4.427920896, + "loss": 1.381, + "grad_norm": 0.1865333765745163, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.797513216, + "gpu_mem": 4.427913216, + "loss": 1.3933, + "grad_norm": 0.2674986720085144, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.797709824, + "gpu_mem": 4.427914752, + "loss": 1.395, + "grad_norm": 0.20257696509361267, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.797906432, + "gpu_mem": 4.427890176, + "loss": 1.3979, + "grad_norm": 0.2197108417749405, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.797906432, + "gpu_mem": 4.427891712, + "loss": 1.3826, + "grad_norm": 0.17545858025550842, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.79810304, + "gpu_mem": 4.42791168, + "loss": 1.3895, + "grad_norm": 0.20069585740566254, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.798299648, + "gpu_mem": 4.427884032, + "loss": 1.3952, + "grad_norm": 0.3554527461528778, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.798299648, + "gpu_mem": 4.42789632, + "loss": 1.3654, + "grad_norm": 0.23818296194076538, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.798496256, + "gpu_mem": 4.427905536, + "loss": 1.4006, + "grad_norm": 0.3801953196525574, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.798496256, + "gpu_mem": 4.427882496, + "loss": 1.3603, + "grad_norm": 0.36717379093170166, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.798496256, + "gpu_mem": 4.427907072, + "loss": 1.3972, + "grad_norm": 0.38428670167922974, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.798692864, + "gpu_mem": 4.427907072, + "loss": 1.3781, + "grad_norm": 0.21847184002399445, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.798692864, + "gpu_mem": 4.427890176, + "loss": 1.3824, + "grad_norm": 0.22717754542827606, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.798889472, + "gpu_mem": 4.427887104, + "loss": 1.3826, + "grad_norm": 0.5566385984420776, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.798889472, + "gpu_mem": 4.427879424, + "loss": 1.3858, + "grad_norm": 0.2457977831363678, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.798889472, + "gpu_mem": 4.427917824, + "loss": 1.4097, + "grad_norm": 0.34162458777427673, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.798889472, + "gpu_mem": 4.427894784, + "loss": 1.377, + "grad_norm": 0.20359401404857635, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.798889472, + "gpu_mem": 4.427893248, + "loss": 1.3924, + "grad_norm": 0.19181066751480103, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.798889472, + "gpu_mem": 4.427910144, + "loss": 1.3892, + "grad_norm": 0.29182061553001404, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.79908608, + "gpu_mem": 4.427894784, + "loss": 1.3632, + "grad_norm": 0.1866026073694229, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.799282688, + "gpu_mem": 4.427907072, + "loss": 1.3989, + "grad_norm": 0.3078109622001648, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.799282688, + "gpu_mem": 4.42791936, + "loss": 1.3928, + "grad_norm": 0.2851615846157074, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.799479296, + "gpu_mem": 4.427894784, + "loss": 1.3988, + "grad_norm": 0.2348039746284485, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.799479296, + "gpu_mem": 4.427939328, + "loss": 1.4106, + "grad_norm": 0.4080047011375427, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.799479296, + "gpu_mem": 4.427913216, + "loss": 1.3658, + "grad_norm": 0.22262652218341827, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.799479296, + "gpu_mem": 4.427910144, + "loss": 1.3916, + "grad_norm": 0.19965814054012299, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.799675904, + "gpu_mem": 4.427891712, + "loss": 1.3717, + "grad_norm": 0.15649214386940002, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.799872512, + "gpu_mem": 4.427897856, + "loss": 1.3998, + "grad_norm": 0.31218910217285156, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.799872512, + "gpu_mem": 4.427867136, + "loss": 1.3863, + "grad_norm": 0.24148093163967133, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.799872512, + "gpu_mem": 4.427931648, + "loss": 1.3837, + "grad_norm": 0.2234194278717041, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.80006912, + "gpu_mem": 4.427885568, + "loss": 1.4175, + "grad_norm": 0.30912110209465027, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.80006912, + "gpu_mem": 4.427879424, + "loss": 1.3863, + "grad_norm": 0.18907231092453003, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.80006912, + "gpu_mem": 4.42793472, + "loss": 1.3813, + "grad_norm": 0.13177086412906647, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.800265728, + "gpu_mem": 4.427900928, + "loss": 1.4068, + "grad_norm": 0.33327028155326843, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.800265728, + "gpu_mem": 4.42788864, + "loss": 1.3816, + "grad_norm": 0.29859206080436707, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.800265728, + "gpu_mem": 4.427893248, + "loss": 1.3794, + "grad_norm": 0.16572199761867523, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.800462336, + "gpu_mem": 4.42787328, + "loss": 1.3781, + "grad_norm": 0.18548189103603363, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.800658944, + "gpu_mem": 4.427897856, + "loss": 1.3653, + "grad_norm": 0.1738688200712204, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.800658944, + "gpu_mem": 4.427876352, + "loss": 1.3846, + "grad_norm": 0.1618034839630127, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.800658944, + "gpu_mem": 4.427893248, + "loss": 1.3809, + "grad_norm": 0.18480895459651947, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.800658944, + "gpu_mem": 4.42785792, + "loss": 1.3897, + "grad_norm": 0.16952429711818695, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.800855552, + "gpu_mem": 4.427890176, + "loss": 1.3764, + "grad_norm": 0.19237090647220612, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.800855552, + "gpu_mem": 4.427879424, + "loss": 1.3803, + "grad_norm": 0.15912336111068726, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.80105216, + "gpu_mem": 4.427916288, + "loss": 1.3964, + "grad_norm": 0.2638576030731201, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.80105216, + "gpu_mem": 4.427882496, + "loss": 1.3823, + "grad_norm": 0.23109182715415955, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.80105216, + "gpu_mem": 4.427905536, + "loss": 1.3845, + "grad_norm": 0.26969248056411743, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.801248768, + "gpu_mem": 4.427894784, + "loss": 1.3896, + "grad_norm": 0.22973833978176117, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.801248768, + "gpu_mem": 4.427900928, + "loss": 1.3814, + "grad_norm": 0.19195805490016937, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.801248768, + "gpu_mem": 4.427894784, + "loss": 1.3865, + "grad_norm": 0.3326044976711273, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.801248768, + "gpu_mem": 4.427913216, + "loss": 1.3957, + "grad_norm": 0.27566537261009216, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.801445376, + "gpu_mem": 4.42787328, + "loss": 1.3929, + "grad_norm": 0.13244299590587616, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.801641984, + "gpu_mem": 4.427905536, + "loss": 1.3682, + "grad_norm": 0.1512961983680725, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.801641984, + "gpu_mem": 4.427925504, + "loss": 1.3952, + "grad_norm": 0.19759875535964966, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.801641984, + "gpu_mem": 4.42791936, + "loss": 1.3916, + "grad_norm": 0.23054386675357819, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.801641984, + "gpu_mem": 4.427882496, + "loss": 1.4047, + "grad_norm": 0.3223818838596344, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.801641984, + "gpu_mem": 4.427899392, + "loss": 1.3986, + "grad_norm": 0.32353559136390686, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.801838592, + "gpu_mem": 4.427876352, + "loss": 1.3668, + "grad_norm": 0.257954865694046, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.801838592, + "gpu_mem": 4.427908608, + "loss": 1.371, + "grad_norm": 0.4749239981174469, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.801838592, + "gpu_mem": 4.427904, + "loss": 1.3756, + "grad_norm": 0.21304917335510254, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.8020352, + "gpu_mem": 4.427913216, + "loss": 1.3861, + "grad_norm": 0.21457235515117645, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.8020352, + "gpu_mem": 4.427887104, + "loss": 1.3923, + "grad_norm": 0.19404199719429016, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.8020352, + "gpu_mem": 4.427907072, + "loss": 1.3789, + "grad_norm": 0.40425634384155273, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.802231808, + "gpu_mem": 4.42788096, + "loss": 1.3885, + "grad_norm": 0.1962461620569229, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.802231808, + "gpu_mem": 4.427905536, + "loss": 1.4073, + "grad_norm": 0.20592878758907318, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.802231808, + "gpu_mem": 4.427890176, + "loss": 1.3761, + "grad_norm": 0.34218963980674744, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.802428416, + "gpu_mem": 4.427923968, + "loss": 1.3751, + "grad_norm": 0.39566659927368164, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.802428416, + "gpu_mem": 4.427904, + "loss": 1.3916, + "grad_norm": 0.233755424618721, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.802428416, + "gpu_mem": 4.42788864, + "loss": 1.3827, + "grad_norm": 0.20257943868637085, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.802428416, + "gpu_mem": 4.427923968, + "loss": 1.3772, + "grad_norm": 0.1704542636871338, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.802428416, + "gpu_mem": 4.427930112, + "loss": 1.3951, + "grad_norm": 0.27482807636260986, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.802428416, + "gpu_mem": 4.427893248, + "loss": 1.3941, + "grad_norm": 0.2758890688419342, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.802625024, + "gpu_mem": 4.427871744, + "loss": 1.3965, + "grad_norm": 0.37312597036361694, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.802625024, + "gpu_mem": 4.427923968, + "loss": 1.4016, + "grad_norm": 0.3904739022254944, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.802625024, + "gpu_mem": 4.427910144, + "loss": 1.3772, + "grad_norm": 0.22471962869167328, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.802625024, + "gpu_mem": 4.427904, + "loss": 1.3844, + "grad_norm": 0.345672070980072, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.802821632, + "gpu_mem": 4.427910144, + "loss": 1.408, + "grad_norm": 0.3802317976951599, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.802821632, + "gpu_mem": 4.427887104, + "loss": 1.3854, + "grad_norm": 0.290274053812027, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.802821632, + "gpu_mem": 4.427900928, + "loss": 1.3655, + "grad_norm": 0.2517794370651245, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.80301824, + "gpu_mem": 4.427900928, + "loss": 1.3751, + "grad_norm": 0.22509697079658508, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.80301824, + "gpu_mem": 4.427870208, + "loss": 1.3692, + "grad_norm": 0.5314700603485107, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.80301824, + "gpu_mem": 4.427904, + "loss": 1.3883, + "grad_norm": 0.3750682473182678, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.80301824, + "gpu_mem": 4.427882496, + "loss": 1.3814, + "grad_norm": 0.22395756840705872, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.80301824, + "gpu_mem": 4.427890176, + "loss": 1.3741, + "grad_norm": 0.18852758407592773, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.803214848, + "gpu_mem": 4.427908608, + "loss": 1.3777, + "grad_norm": 0.13868863880634308, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.803214848, + "gpu_mem": 4.427876352, + "loss": 1.3865, + "grad_norm": 0.21915729343891144, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.803214848, + "gpu_mem": 4.42788096, + "loss": 1.3972, + "grad_norm": 0.16191133856773376, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.803214848, + "gpu_mem": 4.427876352, + "loss": 1.3849, + "grad_norm": 0.22344857454299927, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.803411456, + "gpu_mem": 4.427920896, + "loss": 1.3906, + "grad_norm": 0.20155151188373566, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.803411456, + "gpu_mem": 4.427904, + "loss": 1.3921, + "grad_norm": 0.3609553277492523, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.803608064, + "gpu_mem": 4.427893248, + "loss": 1.3825, + "grad_norm": 0.17551137506961823, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.803608064, + "gpu_mem": 4.427914752, + "loss": 1.3923, + "grad_norm": 0.4139014184474945, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.803608064, + "gpu_mem": 4.42788096, + "loss": 1.4064, + "grad_norm": 0.21137481927871704, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.803608064, + "gpu_mem": 4.42789632, + "loss": 1.3568, + "grad_norm": 0.2770235240459442, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.803608064, + "gpu_mem": 4.42789632, + "loss": 1.393, + "grad_norm": 0.23751987516880035, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.803804672, + "gpu_mem": 4.427887104, + "loss": 1.3863, + "grad_norm": 0.2553846538066864, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.803804672, + "gpu_mem": 4.427897856, + "loss": 1.4105, + "grad_norm": 0.5835396647453308, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427922432, + "loss": 1.3732, + "grad_norm": 0.19769549369812012, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427874816, + "loss": 1.3966, + "grad_norm": 0.32881098985671997, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427910144, + "loss": 1.3897, + "grad_norm": 0.30016276240348816, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427871744, + "loss": 1.3593, + "grad_norm": 0.3574639856815338, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427890176, + "loss": 1.3956, + "grad_norm": 0.3793168365955353, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427882496, + "loss": 1.3804, + "grad_norm": 0.2265537679195404, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.80400128, + "gpu_mem": 4.42791936, + "loss": 1.3802, + "grad_norm": 0.21161644160747528, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427879424, + "loss": 1.3832, + "grad_norm": 0.25485891103744507, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.80400128, + "gpu_mem": 4.427893248, + "loss": 1.3896, + "grad_norm": 0.4393777847290039, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.804197888, + "gpu_mem": 4.427897856, + "loss": 1.3904, + "grad_norm": 0.3273153603076935, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.804197888, + "gpu_mem": 4.427859456, + "loss": 1.3788, + "grad_norm": 0.3870006203651428, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.804197888, + "gpu_mem": 4.427882496, + "loss": 1.3988, + "grad_norm": 0.20169475674629211, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.804197888, + "gpu_mem": 4.42788096, + "loss": 1.369, + "grad_norm": 0.2402469515800476, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.804197888, + "gpu_mem": 4.427899392, + "loss": 1.4078, + "grad_norm": 0.3025183379650116, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.804394496, + "gpu_mem": 4.42789632, + "loss": 1.3973, + "grad_norm": 0.19372686743736267, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.804591104, + "gpu_mem": 4.427894784, + "loss": 1.3908, + "grad_norm": 0.15103192627429962, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.804591104, + "gpu_mem": 4.427913216, + "loss": 1.37, + "grad_norm": 0.27180764079093933, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.804591104, + "gpu_mem": 4.427874816, + "loss": 1.3815, + "grad_norm": 0.18607938289642334, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.804591104, + "gpu_mem": 4.42791936, + "loss": 1.383, + "grad_norm": 0.22780287265777588, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.804591104, + "gpu_mem": 4.427884032, + "loss": 1.3854, + "grad_norm": 0.31185048818588257, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.804787712, + "gpu_mem": 4.42791168, + "loss": 1.3777, + "grad_norm": 0.24155549705028534, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.804787712, + "gpu_mem": 4.427891712, + "loss": 1.3868, + "grad_norm": 0.20373307168483734, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.804787712, + "gpu_mem": 4.427937792, + "loss": 1.3896, + "grad_norm": 0.4268084168434143, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.804787712, + "gpu_mem": 4.427902464, + "loss": 1.3893, + "grad_norm": 0.26201963424682617, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.804787712, + "gpu_mem": 4.427893248, + "loss": 1.3903, + "grad_norm": 0.2579062581062317, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.804787712, + "gpu_mem": 4.427887104, + "loss": 1.4031, + "grad_norm": 0.261303573846817, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.804787712, + "gpu_mem": 4.427871744, + "loss": 1.387, + "grad_norm": 0.24433138966560364, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.804787712, + "gpu_mem": 4.427890176, + "loss": 1.4059, + "grad_norm": 0.4248882234096527, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.80498432, + "gpu_mem": 4.427891712, + "loss": 1.3884, + "grad_norm": 0.3944558799266815, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.80498432, + "gpu_mem": 4.42789632, + "loss": 1.4068, + "grad_norm": 0.3191363215446472, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427899392, + "loss": 1.3881, + "grad_norm": 0.1942290961742401, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427893248, + "loss": 1.3893, + "grad_norm": 0.22707723081111908, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.805180928, + "gpu_mem": 4.42791936, + "loss": 1.3823, + "grad_norm": 0.19506552815437317, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427887104, + "loss": 1.3888, + "grad_norm": 0.36654236912727356, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427914752, + "loss": 1.3923, + "grad_norm": 0.2960723638534546, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427922432, + "loss": 1.3761, + "grad_norm": 0.39275842905044556, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427904, + "loss": 1.4066, + "grad_norm": 0.32106947898864746, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427890176, + "loss": 1.3962, + "grad_norm": 0.27104365825653076, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427900928, + "loss": 1.3866, + "grad_norm": 0.1974455863237381, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.805180928, + "gpu_mem": 4.427893248, + "loss": 1.3725, + "grad_norm": 0.20134980976581573, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.805377536, + "gpu_mem": 4.427910144, + "loss": 1.3728, + "grad_norm": 0.18424881994724274, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.805377536, + "gpu_mem": 4.427882496, + "loss": 1.3767, + "grad_norm": 0.29478615522384644, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.805377536, + "gpu_mem": 4.427913216, + "loss": 1.4062, + "grad_norm": 0.26753243803977966, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.805377536, + "gpu_mem": 4.427894784, + "loss": 1.3705, + "grad_norm": 0.3306908905506134, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.805377536, + "gpu_mem": 4.427882496, + "loss": 1.3896, + "grad_norm": 0.20654071867465973, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.805377536, + "gpu_mem": 4.427894784, + "loss": 1.3723, + "grad_norm": 0.31424593925476074, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.805377536, + "gpu_mem": 4.427900928, + "loss": 1.3869, + "grad_norm": 0.22227732837200165, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.805574144, + "gpu_mem": 4.42788864, + "loss": 1.3898, + "grad_norm": 0.24830330908298492, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.805574144, + "gpu_mem": 4.427877888, + "loss": 1.3781, + "grad_norm": 0.26875898241996765, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.805574144, + "gpu_mem": 4.427879424, + "loss": 1.3953, + "grad_norm": 0.3337668180465698, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.805574144, + "gpu_mem": 4.427893248, + "loss": 1.3798, + "grad_norm": 0.15273812413215637, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.805770752, + "gpu_mem": 4.42789632, + "loss": 1.3833, + "grad_norm": 0.1519613265991211, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427907072, + "loss": 1.3894, + "grad_norm": 0.2369672805070877, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.805770752, + "gpu_mem": 4.42788096, + "loss": 1.3718, + "grad_norm": 0.22679972648620605, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.805770752, + "gpu_mem": 4.42789632, + "loss": 1.3777, + "grad_norm": 0.18139639496803284, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427905536, + "loss": 1.3633, + "grad_norm": 0.25635141134262085, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427879424, + "loss": 1.3883, + "grad_norm": 0.309585839509964, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427885568, + "loss": 1.3913, + "grad_norm": 0.2767360806465149, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427874816, + "loss": 1.3806, + "grad_norm": 0.423133909702301, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.805770752, + "gpu_mem": 4.42788096, + "loss": 1.3901, + "grad_norm": 0.21332073211669922, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427917824, + "loss": 1.3924, + "grad_norm": 0.20767788589000702, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.805770752, + "gpu_mem": 4.4278656, + "loss": 1.3835, + "grad_norm": 0.2239665389060974, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427885568, + "loss": 1.404, + "grad_norm": 0.3428148329257965, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427885568, + "loss": 1.4155, + "grad_norm": 0.36329185962677, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.805770752, + "gpu_mem": 4.427884032, + "loss": 1.4026, + "grad_norm": 0.24411866068840027, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427882496, + "loss": 1.3573, + "grad_norm": 0.3033726215362549, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427874816, + "loss": 1.3924, + "grad_norm": 0.2970454692840576, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.80596736, + "gpu_mem": 4.42793472, + "loss": 1.3842, + "grad_norm": 0.21477773785591125, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427879424, + "loss": 1.3729, + "grad_norm": 0.22061719000339508, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427862528, + "loss": 1.3784, + "grad_norm": 0.2508181631565094, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427893248, + "loss": 1.3895, + "grad_norm": 0.3781414330005646, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427937792, + "loss": 1.3735, + "grad_norm": 0.30316635966300964, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427917824, + "loss": 1.4053, + "grad_norm": 0.2677236497402191, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.80596736, + "gpu_mem": 4.427917824, + "loss": 1.394, + "grad_norm": 0.23909536004066467, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427884032, + "loss": 1.4156, + "grad_norm": 0.37814927101135254, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427908608, + "loss": 1.3623, + "grad_norm": 0.16496235132217407, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.806163968, + "gpu_mem": 4.42791168, + "loss": 1.3821, + "grad_norm": 0.20373417437076569, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427890176, + "loss": 1.3825, + "grad_norm": 0.2962764799594879, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427913216, + "loss": 1.386, + "grad_norm": 0.40625646710395813, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427893248, + "loss": 1.3773, + "grad_norm": 0.23542991280555725, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427917824, + "loss": 1.3796, + "grad_norm": 0.21008829772472382, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427900928, + "loss": 1.3858, + "grad_norm": 0.14179249107837677, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.806163968, + "gpu_mem": 4.42789632, + "loss": 1.3876, + "grad_norm": 0.3233911693096161, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427908608, + "loss": 1.385, + "grad_norm": 0.2537291944026947, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427879424, + "loss": 1.381, + "grad_norm": 0.23926571011543274, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.806163968, + "gpu_mem": 4.427893248, + "loss": 1.3802, + "grad_norm": 0.22145840525627136, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.806360576, + "gpu_mem": 4.427879424, + "loss": 1.3726, + "grad_norm": 0.2471926510334015, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.806360576, + "gpu_mem": 4.42787328, + "loss": 1.3968, + "grad_norm": 0.152239590883255, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.806360576, + "gpu_mem": 4.427879424, + "loss": 1.3877, + "grad_norm": 0.1300077736377716, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.806360576, + "gpu_mem": 4.427893248, + "loss": 1.383, + "grad_norm": 0.17479684948921204, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.806360576, + "gpu_mem": 4.427876352, + "loss": 1.3841, + "grad_norm": 0.3763291835784912, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.806360576, + "gpu_mem": 4.427930112, + "loss": 1.3806, + "grad_norm": 0.1937958300113678, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.806360576, + "gpu_mem": 4.42787328, + "loss": 1.3845, + "grad_norm": 0.17072072625160217, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.806360576, + "gpu_mem": 4.427951616, + "loss": 1.4029, + "grad_norm": 0.2708168029785156, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427894784, + "loss": 1.3687, + "grad_norm": 0.45605719089508057, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427913216, + "loss": 1.3759, + "grad_norm": 0.2190495878458023, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.806557184, + "gpu_mem": 4.42788864, + "loss": 1.4045, + "grad_norm": 0.3240780234336853, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427920896, + "loss": 1.3703, + "grad_norm": 0.22606100142002106, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427940864, + "loss": 1.3877, + "grad_norm": 0.27100232243537903, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427870208, + "loss": 1.3825, + "grad_norm": 0.2193320393562317, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427884032, + "loss": 1.3755, + "grad_norm": 0.27541568875312805, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427868672, + "loss": 1.3716, + "grad_norm": 0.2815854847431183, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427907072, + "loss": 1.3837, + "grad_norm": 0.1636969894170761, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427907072, + "loss": 1.3968, + "grad_norm": 0.2396899163722992, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427893248, + "loss": 1.3817, + "grad_norm": 0.23152011632919312, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.806557184, + "gpu_mem": 4.427884032, + "loss": 1.3743, + "grad_norm": 0.1589415967464447, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.806753792, + "gpu_mem": 4.42788864, + "loss": 1.3863, + "grad_norm": 0.15506024658679962, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427891712, + "loss": 1.3759, + "grad_norm": 0.22831925749778748, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427897856, + "loss": 1.3882, + "grad_norm": 0.19787436723709106, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427916288, + "loss": 1.3811, + "grad_norm": 0.15342572331428528, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427910144, + "loss": 1.3872, + "grad_norm": 0.2460925132036209, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427887104, + "loss": 1.3711, + "grad_norm": 0.2153356373310089, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427874816, + "loss": 1.3774, + "grad_norm": 0.25654342770576477, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427841024, + "loss": 1.3867, + "grad_norm": 0.45844128727912903, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.806753792, + "gpu_mem": 4.42788864, + "loss": 1.3855, + "grad_norm": 0.22111186385154724, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427854848, + "loss": 1.3764, + "grad_norm": 0.2175738513469696, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427902464, + "loss": 1.3775, + "grad_norm": 0.2879142761230469, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427900928, + "loss": 1.3906, + "grad_norm": 0.17733579874038696, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427902464, + "loss": 1.3998, + "grad_norm": 0.33939772844314575, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.806753792, + "gpu_mem": 4.42791168, + "loss": 1.3782, + "grad_norm": 0.23376309871673584, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427887104, + "loss": 1.3879, + "grad_norm": 0.21135689318180084, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427871744, + "loss": 1.3798, + "grad_norm": 0.18904167413711548, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427900928, + "loss": 1.3856, + "grad_norm": 0.3161558210849762, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.806753792, + "gpu_mem": 4.427914752, + "loss": 1.3776, + "grad_norm": 0.2435716986656189, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.8069504, + "gpu_mem": 4.427870208, + "loss": 1.3862, + "grad_norm": 0.36250630021095276, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.8069504, + "gpu_mem": 4.427876352, + "loss": 1.3721, + "grad_norm": 0.33724355697631836, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.8069504, + "gpu_mem": 4.427905536, + "loss": 1.3739, + "grad_norm": 0.15495432913303375, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.8069504, + "gpu_mem": 4.427900928, + "loss": 1.3814, + "grad_norm": 0.27987807989120483, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.8069504, + "gpu_mem": 4.427887104, + "loss": 1.3854, + "grad_norm": 0.206812784075737, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.8069504, + "gpu_mem": 4.427900928, + "loss": 1.3863, + "grad_norm": 0.280670702457428, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427890176, + "loss": 1.3676, + "grad_norm": 0.24905936419963837, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.807147008, + "gpu_mem": 4.42789632, + "loss": 1.3649, + "grad_norm": 0.15757879614830017, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427900928, + "loss": 1.3878, + "grad_norm": 0.23866796493530273, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.807147008, + "gpu_mem": 4.42789632, + "loss": 1.3857, + "grad_norm": 0.1899343878030777, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427870208, + "loss": 1.3991, + "grad_norm": 0.23497949540615082, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427879424, + "loss": 1.3856, + "grad_norm": 0.31674185395240784, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427897856, + "loss": 1.374, + "grad_norm": 0.2980993688106537, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427868672, + "loss": 1.3932, + "grad_norm": 0.3357037603855133, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427899392, + "loss": 1.3755, + "grad_norm": 0.3329955041408539, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427908608, + "loss": 1.3941, + "grad_norm": 0.2712467312812805, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427870208, + "loss": 1.3822, + "grad_norm": 0.18637706339359283, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427874816, + "loss": 1.3695, + "grad_norm": 0.2327679991722107, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427899392, + "loss": 1.3753, + "grad_norm": 0.19190791249275208, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427917824, + "loss": 1.3702, + "grad_norm": 0.16021092236042023, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427899392, + "loss": 1.3991, + "grad_norm": 0.44699370861053467, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.807147008, + "gpu_mem": 4.42795008, + "loss": 1.3896, + "grad_norm": 0.22874058783054352, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427882496, + "loss": 1.3674, + "grad_norm": 0.2311699092388153, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427884032, + "loss": 1.3834, + "grad_norm": 0.21745900809764862, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427884032, + "loss": 1.4094, + "grad_norm": 0.28689590096473694, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427890176, + "loss": 1.3719, + "grad_norm": 0.25520288944244385, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427904, + "loss": 1.3859, + "grad_norm": 0.24004840850830078, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427908608, + "loss": 1.4022, + "grad_norm": 0.22257260978221893, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427902464, + "loss": 1.3919, + "grad_norm": 0.28382784128189087, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.807147008, + "gpu_mem": 4.42789632, + "loss": 1.4048, + "grad_norm": 0.35134342312812805, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427910144, + "loss": 1.3811, + "grad_norm": 0.35479477047920227, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427902464, + "loss": 1.3781, + "grad_norm": 0.22355209290981293, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427887104, + "loss": 1.4047, + "grad_norm": 0.35810643434524536, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.807147008, + "gpu_mem": 4.42789632, + "loss": 1.3824, + "grad_norm": 0.17156344652175903, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427905536, + "loss": 1.3789, + "grad_norm": 0.22839798033237457, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427908608, + "loss": 1.3922, + "grad_norm": 0.3308348059654236, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427862528, + "loss": 1.3767, + "grad_norm": 0.20727093517780304, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427916288, + "loss": 1.3826, + "grad_norm": 0.28668805956840515, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427917824, + "loss": 1.3919, + "grad_norm": 0.1984148472547531, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427862528, + "loss": 1.3906, + "grad_norm": 0.19590599834918976, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.807147008, + "gpu_mem": 4.42789632, + "loss": 1.3839, + "grad_norm": 0.31229159235954285, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427874816, + "loss": 1.3642, + "grad_norm": 0.2148614078760147, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427905536, + "loss": 1.376, + "grad_norm": 0.16044513881206512, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.807147008, + "gpu_mem": 4.42788096, + "loss": 1.3829, + "grad_norm": 0.36317330598831177, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.807147008, + "gpu_mem": 4.427914752, + "loss": 1.3817, + "grad_norm": 0.24774780869483948, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.807343616, + "gpu_mem": 4.42793472, + "loss": 1.3927, + "grad_norm": 0.22647827863693237, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427899392, + "loss": 1.3823, + "grad_norm": 0.19259247183799744, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.807343616, + "gpu_mem": 4.42791936, + "loss": 1.3744, + "grad_norm": 0.2021103799343109, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427899392, + "loss": 1.3813, + "grad_norm": 0.2729724943637848, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427900928, + "loss": 1.3737, + "grad_norm": 0.2996758222579956, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427893248, + "loss": 1.3848, + "grad_norm": 0.2452775537967682, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427891712, + "loss": 1.3728, + "grad_norm": 0.33303824067115784, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427905536, + "loss": 1.397, + "grad_norm": 0.2249879091978073, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427876352, + "loss": 1.3938, + "grad_norm": 0.23524099588394165, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.807343616, + "gpu_mem": 4.42792704, + "loss": 1.3907, + "grad_norm": 0.3531104624271393, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427923968, + "loss": 1.375, + "grad_norm": 0.19632327556610107, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427904, + "loss": 1.3864, + "grad_norm": 0.24451696872711182, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427885568, + "loss": 1.3924, + "grad_norm": 0.22732041776180267, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427894784, + "loss": 1.3999, + "grad_norm": 0.17179137468338013, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427862528, + "loss": 1.3727, + "grad_norm": 0.2046767920255661, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427923968, + "loss": 1.381, + "grad_norm": 0.24178598821163177, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427922432, + "loss": 1.3962, + "grad_norm": 0.20198361575603485, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427877888, + "loss": 1.3802, + "grad_norm": 0.2751697301864624, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427910144, + "loss": 1.363, + "grad_norm": 0.18869523704051971, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427904, + "loss": 1.3743, + "grad_norm": 0.2958340644836426, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427890176, + "loss": 1.3912, + "grad_norm": 0.3308315873146057, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427890176, + "loss": 1.3762, + "grad_norm": 0.22092914581298828, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427916288, + "loss": 1.3799, + "grad_norm": 0.21797901391983032, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427904, + "loss": 1.3933, + "grad_norm": 0.2387133091688156, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427894784, + "loss": 1.38, + "grad_norm": 0.20657935738563538, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427887104, + "loss": 1.378, + "grad_norm": 0.19979049265384674, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427910144, + "loss": 1.3766, + "grad_norm": 0.3033987879753113, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427899392, + "loss": 1.3636, + "grad_norm": 0.23238056898117065, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427884032, + "loss": 1.394, + "grad_norm": 0.3170299828052521, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.807343616, + "gpu_mem": 4.42795008, + "loss": 1.3795, + "grad_norm": 0.3350628614425659, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427893248, + "loss": 1.3899, + "grad_norm": 0.19352863729000092, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427879424, + "loss": 1.3781, + "grad_norm": 0.23144470155239105, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427943936, + "loss": 1.3935, + "grad_norm": 0.34928619861602783, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.807343616, + "gpu_mem": 4.42787328, + "loss": 1.3861, + "grad_norm": 0.22614452242851257, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427900928, + "loss": 1.3679, + "grad_norm": 0.2261040061712265, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427904, + "loss": 1.3892, + "grad_norm": 0.2079881876707077, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427879424, + "loss": 1.3827, + "grad_norm": 0.30253365635871887, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427907072, + "loss": 1.3835, + "grad_norm": 0.22048328816890717, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427916288, + "loss": 1.3869, + "grad_norm": 0.21798859536647797, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427913216, + "loss": 1.4059, + "grad_norm": 0.2274550348520279, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427885568, + "loss": 1.3869, + "grad_norm": 0.2927607595920563, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427632128, + "loss": 1.3875, + "grad_norm": 0.27612918615341187, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.807343616, + "gpu_mem": 4.427632128, + "train_runtime": 8047.069, + "train_samples_per_second": 4.959, + "train_steps_per_second": 0.078, + "total_flos": 0.0, + "train_loss": 1.4574319168161123 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d69c6bc9ef572e681044e096143c4cad32a3229 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..47df7e1892832c6cee101a7429157b4b9332eb00 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.8305118502290381 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..dbc181bfcfe8ae2e1a53a54d4ce06b243f8913a5 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12773376 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-hellaswag-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2", + "seed": 42, + "timestamp": "2025-09-02T22:47:17.048106" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..32eb3f551ee78d2dc405d71533dd9a667ca599c5 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r32-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.709596672, + "gpu_mem": 4.47020288, + "loss": 3.4877, + "grad_norm": 25.396909713745117, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.710579712, + "gpu_mem": 4.572383232, + "loss": 3.6203, + "grad_norm": 25.009212493896484, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.711562752, + "gpu_mem": 4.572390912, + "loss": 3.3119, + "grad_norm": 24.136140823364258, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.7127424, + "gpu_mem": 4.572424704, + "loss": 3.2844, + "grad_norm": 21.43589973449707, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.71372544, + "gpu_mem": 4.57238784, + "loss": 2.8896, + "grad_norm": 18.90281105041504, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.71470848, + "gpu_mem": 4.57243392, + "loss": 2.608, + "grad_norm": 16.495716094970703, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.715494912, + "gpu_mem": 4.572393984, + "loss": 2.3462, + "grad_norm": 11.801786422729492, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.716281344, + "gpu_mem": 4.572424704, + "loss": 1.841, + "grad_norm": 7.486019611358643, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.717067776, + "gpu_mem": 4.572424704, + "loss": 1.7273, + "grad_norm": 5.024730205535889, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.717854208, + "gpu_mem": 4.572367872, + "loss": 1.5819, + "grad_norm": 3.5665154457092285, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.71864064, + "gpu_mem": 4.57238784, + "loss": 1.4184, + "grad_norm": 1.7875581979751587, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.719230464, + "gpu_mem": 4.572384768, + "loss": 1.4236, + "grad_norm": 2.129807949066162, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.720016896, + "gpu_mem": 4.572377088, + "loss": 1.3997, + "grad_norm": 1.2990965843200684, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.720803328, + "gpu_mem": 4.5724032, + "loss": 1.3973, + "grad_norm": 1.8292770385742188, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.72158976, + "gpu_mem": 4.572401664, + "loss": 1.4534, + "grad_norm": 2.930569648742676, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.722376192, + "gpu_mem": 4.572393984, + "loss": 1.399, + "grad_norm": 1.212304949760437, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.722966016, + "gpu_mem": 4.572393984, + "loss": 1.4633, + "grad_norm": 2.765972852706909, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.723752448, + "gpu_mem": 4.572393984, + "loss": 1.374, + "grad_norm": 2.586609363555908, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.72453888, + "gpu_mem": 4.572393984, + "loss": 1.4735, + "grad_norm": 1.7069041728973389, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.725128704, + "gpu_mem": 4.572367872, + "loss": 1.5342, + "grad_norm": 2.9016835689544678, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.725718528, + "gpu_mem": 4.572384768, + "loss": 1.5019, + "grad_norm": 2.2960596084594727, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.726308352, + "gpu_mem": 4.572392448, + "loss": 1.443, + "grad_norm": 2.207826852798462, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.727094784, + "gpu_mem": 4.572406272, + "loss": 1.4266, + "grad_norm": 1.6639866828918457, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.727684608, + "gpu_mem": 4.572390912, + "loss": 1.3538, + "grad_norm": 1.3079107999801636, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.728274432, + "gpu_mem": 4.572378624, + "loss": 1.494, + "grad_norm": 3.0879533290863037, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.728864256, + "gpu_mem": 4.572384768, + "loss": 1.4886, + "grad_norm": 2.9382948875427246, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.72945408, + "gpu_mem": 4.572392448, + "loss": 1.4075, + "grad_norm": 1.4374048709869385, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.730043904, + "gpu_mem": 4.57238784, + "loss": 1.4169, + "grad_norm": 1.3458433151245117, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.730633728, + "gpu_mem": 4.572397056, + "loss": 1.4288, + "grad_norm": 1.4504384994506836, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.731223552, + "gpu_mem": 4.572369408, + "loss": 1.4734, + "grad_norm": 1.8393325805664062, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.731813376, + "gpu_mem": 4.572424704, + "loss": 1.3948, + "grad_norm": 0.8968992829322815, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.7324032, + "gpu_mem": 4.572417024, + "loss": 1.4373, + "grad_norm": 1.1625826358795166, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.732796416, + "gpu_mem": 4.572370944, + "loss": 1.3777, + "grad_norm": 0.297127902507782, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.73338624, + "gpu_mem": 4.572389376, + "loss": 1.4154, + "grad_norm": 0.6762979030609131, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.733976064, + "gpu_mem": 4.57241088, + "loss": 1.4594, + "grad_norm": 1.658595323562622, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.734565888, + "gpu_mem": 4.572409344, + "loss": 1.4147, + "grad_norm": 0.8403065204620361, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.735155712, + "gpu_mem": 4.5724416, + "loss": 1.4081, + "grad_norm": 0.6784794330596924, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.735745536, + "gpu_mem": 4.572393984, + "loss": 1.4021, + "grad_norm": 0.43012189865112305, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.73633536, + "gpu_mem": 4.572450816, + "loss": 1.3995, + "grad_norm": 1.3948243856430054, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.736925184, + "gpu_mem": 4.572378624, + "loss": 1.4515, + "grad_norm": 1.3615171909332275, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.737515008, + "gpu_mem": 4.572406272, + "loss": 1.3803, + "grad_norm": 0.7484529614448547, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.737908224, + "gpu_mem": 4.572420096, + "loss": 1.4184, + "grad_norm": 1.0618382692337036, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.738498048, + "gpu_mem": 4.57242624, + "loss": 1.3779, + "grad_norm": 0.25215455889701843, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.739087872, + "gpu_mem": 4.572404736, + "loss": 1.3978, + "grad_norm": 0.5764678716659546, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.739481088, + "gpu_mem": 4.572404736, + "loss": 1.3961, + "grad_norm": 0.36457017064094543, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.739874304, + "gpu_mem": 4.572404736, + "loss": 1.3957, + "grad_norm": 1.194234013557434, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.740464128, + "gpu_mem": 4.572390912, + "loss": 1.3867, + "grad_norm": 0.5570566058158875, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.741053952, + "gpu_mem": 4.572409344, + "loss": 1.3752, + "grad_norm": 0.6216866374015808, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.741643776, + "gpu_mem": 4.572421632, + "loss": 1.3988, + "grad_norm": 0.7088686227798462, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.7422336, + "gpu_mem": 4.572398592, + "loss": 1.3882, + "grad_norm": 0.7424083948135376, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.742626816, + "gpu_mem": 4.572383232, + "loss": 1.3749, + "grad_norm": 0.6118662357330322, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.74321664, + "gpu_mem": 4.57238784, + "loss": 1.368, + "grad_norm": 0.41979193687438965, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.743609856, + "gpu_mem": 4.572415488, + "loss": 1.4185, + "grad_norm": 1.0644590854644775, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.744003072, + "gpu_mem": 4.572390912, + "loss": 1.44, + "grad_norm": 1.5842132568359375, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.744396288, + "gpu_mem": 4.572409344, + "loss": 1.3943, + "grad_norm": 0.7805165648460388, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.744986112, + "gpu_mem": 4.5724032, + "loss": 1.3993, + "grad_norm": 0.7331279516220093, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.745575936, + "gpu_mem": 4.572369408, + "loss": 1.3985, + "grad_norm": 0.9271532297134399, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.745969152, + "gpu_mem": 4.572398592, + "loss": 1.4294, + "grad_norm": 0.99676114320755, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.746362368, + "gpu_mem": 4.572381696, + "loss": 1.3234, + "grad_norm": 0.5852205157279968, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.746952192, + "gpu_mem": 4.572423168, + "loss": 1.3998, + "grad_norm": 0.9808391332626343, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.747345408, + "gpu_mem": 4.572389376, + "loss": 1.3821, + "grad_norm": 0.3165935277938843, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.747935232, + "gpu_mem": 4.572429312, + "loss": 1.3319, + "grad_norm": 0.7542638778686523, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.748328448, + "gpu_mem": 4.572383232, + "loss": 1.4561, + "grad_norm": 1.2276780605316162, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.748721664, + "gpu_mem": 4.57238784, + "loss": 1.4599, + "grad_norm": 1.1099942922592163, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.74911488, + "gpu_mem": 4.572384768, + "loss": 1.3923, + "grad_norm": 0.5514512062072754, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.749704704, + "gpu_mem": 4.5724032, + "loss": 1.398, + "grad_norm": 0.5415115356445312, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.750294528, + "gpu_mem": 4.57239552, + "loss": 1.4307, + "grad_norm": 0.9903335571289062, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.750687744, + "gpu_mem": 4.57238016, + "loss": 1.4593, + "grad_norm": 1.2476136684417725, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.75108096, + "gpu_mem": 4.572450816, + "loss": 1.3978, + "grad_norm": 0.8261002898216248, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.751474176, + "gpu_mem": 4.572401664, + "loss": 1.4205, + "grad_norm": 0.779125452041626, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.751867392, + "gpu_mem": 4.57242624, + "loss": 1.3992, + "grad_norm": 0.8912459015846252, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.752457216, + "gpu_mem": 4.572397056, + "loss": 1.411, + "grad_norm": 0.5945685505867004, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.752850432, + "gpu_mem": 4.572389376, + "loss": 1.5117, + "grad_norm": 1.3131664991378784, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.753440256, + "gpu_mem": 4.572383232, + "loss": 1.3822, + "grad_norm": 0.4422674775123596, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.753833472, + "gpu_mem": 4.572412416, + "loss": 1.4236, + "grad_norm": 0.7152919769287109, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.75403008, + "gpu_mem": 4.5724032, + "loss": 1.4245, + "grad_norm": 0.9969199299812317, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.754423296, + "gpu_mem": 4.572390912, + "loss": 1.3904, + "grad_norm": 0.2879968583583832, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.754816512, + "gpu_mem": 4.572383232, + "loss": 1.4014, + "grad_norm": 0.6685864925384521, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.755209728, + "gpu_mem": 4.572435456, + "loss": 1.3881, + "grad_norm": 0.4015916585922241, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.755602944, + "gpu_mem": 4.572413952, + "loss": 1.4106, + "grad_norm": 0.5262051224708557, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.75599616, + "gpu_mem": 4.572407808, + "loss": 1.3601, + "grad_norm": 0.2764841616153717, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.756585984, + "gpu_mem": 4.572384768, + "loss": 1.3956, + "grad_norm": 0.5895227193832397, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.7569792, + "gpu_mem": 4.572406272, + "loss": 1.4162, + "grad_norm": 0.7090219259262085, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.757372416, + "gpu_mem": 4.572378624, + "loss": 1.414, + "grad_norm": 0.7195500731468201, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.757569024, + "gpu_mem": 4.572386304, + "loss": 1.3958, + "grad_norm": 0.7851431965827942, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.758158848, + "gpu_mem": 4.572404736, + "loss": 1.419, + "grad_norm": 1.1495107412338257, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.758748672, + "gpu_mem": 4.572393984, + "loss": 1.3908, + "grad_norm": 0.47955670952796936, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.75894528, + "gpu_mem": 4.572392448, + "loss": 1.3948, + "grad_norm": 0.9657104015350342, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.759141888, + "gpu_mem": 4.57238784, + "loss": 1.4021, + "grad_norm": 0.5875508189201355, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.759535104, + "gpu_mem": 4.572392448, + "loss": 1.3839, + "grad_norm": 0.4707915484905243, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.760124928, + "gpu_mem": 4.5724032, + "loss": 1.3875, + "grad_norm": 0.5565634369850159, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.760321536, + "gpu_mem": 4.572406272, + "loss": 1.4071, + "grad_norm": 0.5638189315795898, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.760714752, + "gpu_mem": 4.572406272, + "loss": 1.4054, + "grad_norm": 0.3105180859565735, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.761304576, + "gpu_mem": 4.572401664, + "loss": 1.3941, + "grad_norm": 0.5256777405738831, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.761697792, + "gpu_mem": 4.572420096, + "loss": 1.39, + "grad_norm": 0.5822255611419678, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.762091008, + "gpu_mem": 4.572423168, + "loss": 1.3773, + "grad_norm": 0.1672244817018509, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.762484224, + "gpu_mem": 4.572400128, + "loss": 1.4023, + "grad_norm": 0.4110303521156311, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.76287744, + "gpu_mem": 4.57241088, + "loss": 1.4121, + "grad_norm": 0.49975842237472534, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.763074048, + "gpu_mem": 4.57241088, + "loss": 1.3803, + "grad_norm": 0.6286399960517883, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.763467264, + "gpu_mem": 4.572386304, + "loss": 1.3965, + "grad_norm": 0.4793117344379425, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.76386048, + "gpu_mem": 4.572415488, + "loss": 1.4093, + "grad_norm": 0.6176011562347412, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.764253696, + "gpu_mem": 4.572392448, + "loss": 1.3637, + "grad_norm": 0.4566946029663086, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.764646912, + "gpu_mem": 4.572409344, + "loss": 1.4162, + "grad_norm": 0.6366556286811829, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.76484352, + "gpu_mem": 4.572377088, + "loss": 1.4172, + "grad_norm": 0.7973282337188721, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.765040128, + "gpu_mem": 4.572392448, + "loss": 1.3874, + "grad_norm": 0.4764900505542755, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.765433344, + "gpu_mem": 4.57237248, + "loss": 1.3932, + "grad_norm": 0.5983567237854004, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.76582656, + "gpu_mem": 4.572413952, + "loss": 1.3937, + "grad_norm": 0.3426465094089508, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.766219776, + "gpu_mem": 4.572409344, + "loss": 1.3871, + "grad_norm": 0.3976573348045349, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.766416384, + "gpu_mem": 4.572415488, + "loss": 1.4083, + "grad_norm": 0.6993476748466492, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.7668096, + "gpu_mem": 4.572412416, + "loss": 1.3905, + "grad_norm": 0.4156718850135803, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.767202816, + "gpu_mem": 4.572413952, + "loss": 1.3774, + "grad_norm": 0.4693636894226074, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.767596032, + "gpu_mem": 4.57241088, + "loss": 1.3827, + "grad_norm": 0.33568042516708374, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.76779264, + "gpu_mem": 4.572390912, + "loss": 1.3957, + "grad_norm": 0.3714927136898041, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.768185856, + "gpu_mem": 4.572386304, + "loss": 1.364, + "grad_norm": 0.458238422870636, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.768382464, + "gpu_mem": 4.572404736, + "loss": 1.4111, + "grad_norm": 0.5273997783660889, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.76877568, + "gpu_mem": 4.572415488, + "loss": 1.4064, + "grad_norm": 0.45401254296302795, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.769168896, + "gpu_mem": 4.572401664, + "loss": 1.3938, + "grad_norm": 0.36659786105155945, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.769562112, + "gpu_mem": 4.572417024, + "loss": 1.4486, + "grad_norm": 0.9787864685058594, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.769955328, + "gpu_mem": 4.572398592, + "loss": 1.3552, + "grad_norm": 0.5448693633079529, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.770348544, + "gpu_mem": 4.572424704, + "loss": 1.4001, + "grad_norm": 0.6557159423828125, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.770545152, + "gpu_mem": 4.572383232, + "loss": 1.3857, + "grad_norm": 0.4595993757247925, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.770938368, + "gpu_mem": 4.572415488, + "loss": 1.3893, + "grad_norm": 0.49075114727020264, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.771331584, + "gpu_mem": 4.572409344, + "loss": 1.4064, + "grad_norm": 0.2879602909088135, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.771528192, + "gpu_mem": 4.57241088, + "loss": 1.3688, + "grad_norm": 0.27523940801620483, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.7717248, + "gpu_mem": 4.572386304, + "loss": 1.3913, + "grad_norm": 0.6776530146598816, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.772118016, + "gpu_mem": 4.57239552, + "loss": 1.3723, + "grad_norm": 0.5853263735771179, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.772511232, + "gpu_mem": 4.572381696, + "loss": 1.4142, + "grad_norm": 0.6179822683334351, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.772904448, + "gpu_mem": 4.57241856, + "loss": 1.3671, + "grad_norm": 0.37271422147750854, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.773297664, + "gpu_mem": 4.572415488, + "loss": 1.3843, + "grad_norm": 0.18353167176246643, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.773494272, + "gpu_mem": 4.572415488, + "loss": 1.4069, + "grad_norm": 0.45111411809921265, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.77369088, + "gpu_mem": 4.572404736, + "loss": 1.3857, + "grad_norm": 0.4190365970134735, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.774084096, + "gpu_mem": 4.572404736, + "loss": 1.3532, + "grad_norm": 0.3373219966888428, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.774477312, + "gpu_mem": 4.572386304, + "loss": 1.3779, + "grad_norm": 0.3579989969730377, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.77467392, + "gpu_mem": 4.572397056, + "loss": 1.4037, + "grad_norm": 0.508078932762146, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.775067136, + "gpu_mem": 4.572406272, + "loss": 1.3607, + "grad_norm": 0.24533209204673767, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.775460352, + "gpu_mem": 4.572421632, + "loss": 1.4173, + "grad_norm": 0.6665544509887695, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.77565696, + "gpu_mem": 4.572369408, + "loss": 1.4058, + "grad_norm": 0.5267869830131531, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.776050176, + "gpu_mem": 4.572389376, + "loss": 1.3914, + "grad_norm": 0.3911932706832886, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.776246784, + "gpu_mem": 4.572370944, + "loss": 1.3694, + "grad_norm": 0.36887943744659424, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.776443392, + "gpu_mem": 4.57238784, + "loss": 1.3928, + "grad_norm": 0.6021479964256287, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.77664, + "gpu_mem": 4.572393984, + "loss": 1.4009, + "grad_norm": 0.5595008134841919, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.777033216, + "gpu_mem": 4.572390912, + "loss": 1.3771, + "grad_norm": 0.27814146876335144, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.777426432, + "gpu_mem": 4.572417024, + "loss": 1.3697, + "grad_norm": 0.29995304346084595, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.77762304, + "gpu_mem": 4.572390912, + "loss": 1.3885, + "grad_norm": 0.28541743755340576, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.777819648, + "gpu_mem": 4.572430848, + "loss": 1.3683, + "grad_norm": 0.45492023229599, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.778212864, + "gpu_mem": 4.57238016, + "loss": 1.4171, + "grad_norm": 0.5694912672042847, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.778409472, + "gpu_mem": 4.572389376, + "loss": 1.3853, + "grad_norm": 0.5467325448989868, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.77860608, + "gpu_mem": 4.572409344, + "loss": 1.3644, + "grad_norm": 0.3558543622493744, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.778802688, + "gpu_mem": 4.572400128, + "loss": 1.3449, + "grad_norm": 0.37063729763031006, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.779195904, + "gpu_mem": 4.572412416, + "loss": 1.3765, + "grad_norm": 0.4921226501464844, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.77958912, + "gpu_mem": 4.572377088, + "loss": 1.3851, + "grad_norm": 0.5561254024505615, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.779785728, + "gpu_mem": 4.572407808, + "loss": 1.3603, + "grad_norm": 0.5320815443992615, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.779982336, + "gpu_mem": 4.5724032, + "loss": 1.3827, + "grad_norm": 0.5862358212471008, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.780178944, + "gpu_mem": 4.572427776, + "loss": 1.3629, + "grad_norm": 0.5253262519836426, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.780375552, + "gpu_mem": 4.5723648, + "loss": 1.4285, + "grad_norm": 0.8913632035255432, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.780768768, + "gpu_mem": 4.57241856, + "loss": 1.3477, + "grad_norm": 0.4603636860847473, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.781161984, + "gpu_mem": 4.57241088, + "loss": 1.3548, + "grad_norm": 0.40162211656570435, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.781358592, + "gpu_mem": 4.57242624, + "loss": 1.3641, + "grad_norm": 0.36865532398223877, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.7815552, + "gpu_mem": 4.572427776, + "loss": 1.3593, + "grad_norm": 0.3822115659713745, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.781948416, + "gpu_mem": 4.57239552, + "loss": 1.3544, + "grad_norm": 0.4850587844848633, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.782145024, + "gpu_mem": 4.57238784, + "loss": 1.3559, + "grad_norm": 0.4977882206439972, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.782341632, + "gpu_mem": 4.572423168, + "loss": 1.3256, + "grad_norm": 0.5010091066360474, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.78253824, + "gpu_mem": 4.572370944, + "loss": 1.3528, + "grad_norm": 0.9563540816307068, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.782931456, + "gpu_mem": 4.572406272, + "loss": 1.3497, + "grad_norm": 1.003287434577942, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.783128064, + "gpu_mem": 4.57239552, + "loss": 1.3502, + "grad_norm": 1.018513560295105, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.783324672, + "gpu_mem": 4.572427776, + "loss": 1.3632, + "grad_norm": 1.3334022760391235, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.78352128, + "gpu_mem": 4.572421632, + "loss": 1.3336, + "grad_norm": 1.014074444770813, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.783914496, + "gpu_mem": 4.572400128, + "loss": 1.2849, + "grad_norm": 0.8176320791244507, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.784111104, + "gpu_mem": 4.572397056, + "loss": 1.3431, + "grad_norm": 1.70740807056427, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.784307712, + "gpu_mem": 4.57243392, + "loss": 1.3431, + "grad_norm": 1.855790615081787, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.78450432, + "gpu_mem": 4.572407808, + "loss": 1.1734, + "grad_norm": 1.0797275304794312, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.784700928, + "gpu_mem": 4.572384768, + "loss": 1.1627, + "grad_norm": 1.1271494626998901, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.785094144, + "gpu_mem": 4.57241088, + "loss": 1.2616, + "grad_norm": 1.6684577465057373, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.785290752, + "gpu_mem": 4.572423168, + "loss": 1.2701, + "grad_norm": 2.019705057144165, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.78548736, + "gpu_mem": 4.572384768, + "loss": 1.1314, + "grad_norm": 2.0091049671173096, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.785683968, + "gpu_mem": 4.572393984, + "loss": 1.1772, + "grad_norm": 1.835602045059204, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.785880576, + "gpu_mem": 4.572375552, + "loss": 1.2384, + "grad_norm": 2.512620210647583, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.786077184, + "gpu_mem": 4.572389376, + "loss": 1.0932, + "grad_norm": 1.579443097114563, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.786273792, + "gpu_mem": 4.572397056, + "loss": 1.1212, + "grad_norm": 1.7648038864135742, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.786667008, + "gpu_mem": 4.572378624, + "loss": 1.1344, + "grad_norm": 2.400998115539551, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.786863616, + "gpu_mem": 4.572409344, + "loss": 1.1565, + "grad_norm": 2.122922420501709, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.787060224, + "gpu_mem": 4.57238016, + "loss": 1.1779, + "grad_norm": 1.69736647605896, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.78745344, + "gpu_mem": 4.572404736, + "loss": 1.0679, + "grad_norm": 1.4188836812973022, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.787650048, + "gpu_mem": 4.572384768, + "loss": 1.1113, + "grad_norm": 1.89350163936615, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.787846656, + "gpu_mem": 4.572417024, + "loss": 1.0633, + "grad_norm": 1.4810597896575928, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.788043264, + "gpu_mem": 4.572407808, + "loss": 1.2008, + "grad_norm": 1.650045394897461, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.788239872, + "gpu_mem": 4.5724032, + "loss": 1.0845, + "grad_norm": 1.596790075302124, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.78843648, + "gpu_mem": 4.572360192, + "loss": 1.1397, + "grad_norm": 2.031054735183716, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.788633088, + "gpu_mem": 4.572440064, + "loss": 1.0511, + "grad_norm": 1.7195582389831543, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.788829696, + "gpu_mem": 4.572390912, + "loss": 0.9422, + "grad_norm": 2.011331558227539, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.789026304, + "gpu_mem": 4.572390912, + "loss": 0.9348, + "grad_norm": 3.6165857315063477, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.78941952, + "gpu_mem": 4.57235712, + "loss": 1.0489, + "grad_norm": 2.7752137184143066, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.789616128, + "gpu_mem": 4.572397056, + "loss": 0.653, + "grad_norm": 1.767888069152832, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.789812736, + "gpu_mem": 4.572392448, + "loss": 1.0487, + "grad_norm": 3.781047821044922, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.790009344, + "gpu_mem": 4.57238016, + "loss": 0.924, + "grad_norm": 2.2053394317626953, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.790205952, + "gpu_mem": 4.572404736, + "loss": 0.8063, + "grad_norm": 2.7982916831970215, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.79040256, + "gpu_mem": 4.572420096, + "loss": 0.9064, + "grad_norm": 2.465733528137207, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.790599168, + "gpu_mem": 4.572392448, + "loss": 0.8722, + "grad_norm": 5.041089057922363, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.790795776, + "gpu_mem": 4.572389376, + "loss": 1.4901, + "grad_norm": 6.089566707611084, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.790992384, + "gpu_mem": 4.572404736, + "loss": 1.3749, + "grad_norm": 5.296858787536621, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.791188992, + "gpu_mem": 4.572381696, + "loss": 0.9179, + "grad_norm": 2.773772954940796, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.7913856, + "gpu_mem": 4.572383232, + "loss": 0.7621, + "grad_norm": 1.5139659643173218, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.7913856, + "gpu_mem": 4.572424704, + "loss": 0.7155, + "grad_norm": 1.752686619758606, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.791582208, + "gpu_mem": 4.57239552, + "loss": 0.7952, + "grad_norm": 1.8799868822097778, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.791778816, + "gpu_mem": 4.57239552, + "loss": 1.0038, + "grad_norm": 1.7779262065887451, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.791975424, + "gpu_mem": 4.572392448, + "loss": 0.8104, + "grad_norm": 1.7243186235427856, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.792172032, + "gpu_mem": 4.572392448, + "loss": 0.925, + "grad_norm": 1.736804485321045, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.792565248, + "gpu_mem": 4.572383232, + "loss": 0.8343, + "grad_norm": 2.173525094985962, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.792761856, + "gpu_mem": 4.57241856, + "loss": 0.71, + "grad_norm": 2.537572145462036, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.792958464, + "gpu_mem": 4.572375552, + "loss": 0.9752, + "grad_norm": 2.327475070953369, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.793155072, + "gpu_mem": 4.5724032, + "loss": 0.7642, + "grad_norm": 2.1844372749328613, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.793155072, + "gpu_mem": 4.572412416, + "loss": 0.7745, + "grad_norm": 1.6594189405441284, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.79335168, + "gpu_mem": 4.572384768, + "loss": 0.8189, + "grad_norm": 2.0410757064819336, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.793548288, + "gpu_mem": 4.572393984, + "loss": 0.7776, + "grad_norm": 2.755866765975952, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.793744896, + "gpu_mem": 4.57239552, + "loss": 0.7219, + "grad_norm": 2.430938482284546, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.794138112, + "gpu_mem": 4.57239552, + "loss": 0.7962, + "grad_norm": 2.4007620811462402, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.794531328, + "gpu_mem": 4.57238016, + "loss": 0.6506, + "grad_norm": 2.7735657691955566, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.794727936, + "gpu_mem": 4.572401664, + "loss": 0.662, + "grad_norm": 3.1564464569091797, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.794727936, + "gpu_mem": 4.572435456, + "loss": 0.7997, + "grad_norm": 2.4561262130737305, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.794924544, + "gpu_mem": 4.572389376, + "loss": 0.8808, + "grad_norm": 2.9486501216888428, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.795121152, + "gpu_mem": 4.57239552, + "loss": 0.8674, + "grad_norm": 3.36849045753479, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.79531776, + "gpu_mem": 4.57241088, + "loss": 0.9135, + "grad_norm": 2.9693214893341064, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.79531776, + "gpu_mem": 4.572429312, + "loss": 0.6466, + "grad_norm": 2.19952130317688, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.795514368, + "gpu_mem": 4.572398592, + "loss": 0.6278, + "grad_norm": 2.192324638366699, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.795710976, + "gpu_mem": 4.572384768, + "loss": 0.6854, + "grad_norm": 1.8270143270492554, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.795907584, + "gpu_mem": 4.572377088, + "loss": 0.6563, + "grad_norm": 2.584806442260742, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.796104192, + "gpu_mem": 4.5724416, + "loss": 0.6876, + "grad_norm": 1.8320900201797485, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.7963008, + "gpu_mem": 4.57238016, + "loss": 0.7853, + "grad_norm": 2.953617572784424, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.796497408, + "gpu_mem": 4.572432384, + "loss": 0.6813, + "grad_norm": 2.281503677368164, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.796694016, + "gpu_mem": 4.572413952, + "loss": 0.6395, + "grad_norm": 1.9307745695114136, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.796890624, + "gpu_mem": 4.572412416, + "loss": 0.6719, + "grad_norm": 1.9474403858184814, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.79728384, + "gpu_mem": 4.572417024, + "loss": 0.73, + "grad_norm": 1.9232150316238403, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.797480448, + "gpu_mem": 4.572392448, + "loss": 0.5434, + "grad_norm": 1.8359713554382324, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.797480448, + "gpu_mem": 4.572421632, + "loss": 0.5047, + "grad_norm": 2.3669490814208984, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.797677056, + "gpu_mem": 4.572398592, + "loss": 0.6767, + "grad_norm": 2.087580919265747, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.797677056, + "gpu_mem": 4.572460032, + "loss": 0.5157, + "grad_norm": 2.1725306510925293, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.797873664, + "gpu_mem": 4.572384768, + "loss": 0.5702, + "grad_norm": 2.490737199783325, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.798070272, + "gpu_mem": 4.57239552, + "loss": 0.7019, + "grad_norm": 2.8851280212402344, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.79826688, + "gpu_mem": 4.572393984, + "loss": 0.8605, + "grad_norm": 3.0613157749176025, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.798463488, + "gpu_mem": 4.572390912, + "loss": 0.7869, + "grad_norm": 3.247941493988037, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.798660096, + "gpu_mem": 4.572421632, + "loss": 0.3829, + "grad_norm": 1.8984922170639038, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.798856704, + "gpu_mem": 4.572400128, + "loss": 0.4924, + "grad_norm": 2.033193588256836, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.799053312, + "gpu_mem": 4.57239552, + "loss": 0.6568, + "grad_norm": 3.0940663814544678, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.79924992, + "gpu_mem": 4.572406272, + "loss": 0.5547, + "grad_norm": 1.9632433652877808, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.79924992, + "gpu_mem": 4.57241088, + "loss": 0.7931, + "grad_norm": 2.4235031604766846, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.799446528, + "gpu_mem": 4.57237248, + "loss": 0.8317, + "grad_norm": 2.894374132156372, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.799643136, + "gpu_mem": 4.572440064, + "loss": 0.9816, + "grad_norm": 2.576711416244507, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.799839744, + "gpu_mem": 4.5724032, + "loss": 0.7731, + "grad_norm": 1.768857479095459, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.800036352, + "gpu_mem": 4.572392448, + "loss": 0.7311, + "grad_norm": 1.9289555549621582, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.80023296, + "gpu_mem": 4.572409344, + "loss": 0.671, + "grad_norm": 1.5736393928527832, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.80023296, + "gpu_mem": 4.572383232, + "loss": 0.5563, + "grad_norm": 1.483507513999939, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.800429568, + "gpu_mem": 4.572430848, + "loss": 0.6098, + "grad_norm": 1.8974123001098633, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.800626176, + "gpu_mem": 4.572398592, + "loss": 0.6189, + "grad_norm": 2.144845724105835, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.800822784, + "gpu_mem": 4.57238784, + "loss": 0.4605, + "grad_norm": 1.4728738069534302, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.801019392, + "gpu_mem": 4.5724032, + "loss": 0.7175, + "grad_norm": 2.678682804107666, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.801216, + "gpu_mem": 4.572400128, + "loss": 0.75, + "grad_norm": 2.9448490142822266, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.801216, + "gpu_mem": 4.572400128, + "loss": 0.6324, + "grad_norm": 2.389721632003784, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.801412608, + "gpu_mem": 4.57238784, + "loss": 0.5875, + "grad_norm": 2.329468250274658, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.801609216, + "gpu_mem": 4.572370944, + "loss": 0.7294, + "grad_norm": 3.3815619945526123, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.801609216, + "gpu_mem": 4.57243392, + "loss": 0.7751, + "grad_norm": 2.8914947509765625, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.801805824, + "gpu_mem": 4.57238784, + "loss": 0.622, + "grad_norm": 2.5867340564727783, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.802002432, + "gpu_mem": 4.572397056, + "loss": 0.4397, + "grad_norm": 2.011258602142334, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.80219904, + "gpu_mem": 4.572432384, + "loss": 0.4445, + "grad_norm": 2.348716974258423, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.802395648, + "gpu_mem": 4.572397056, + "loss": 0.7094, + "grad_norm": 2.278942108154297, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.802395648, + "gpu_mem": 4.572401664, + "loss": 0.6389, + "grad_norm": 2.4263010025024414, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.802395648, + "gpu_mem": 4.57244928, + "loss": 0.483, + "grad_norm": 1.8980971574783325, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.802592256, + "gpu_mem": 4.572458496, + "loss": 0.5404, + "grad_norm": 1.633828043937683, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.802788864, + "gpu_mem": 4.572412416, + "loss": 0.425, + "grad_norm": 1.9854241609573364, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.802985472, + "gpu_mem": 4.572406272, + "loss": 0.5557, + "grad_norm": 1.8791842460632324, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.802985472, + "gpu_mem": 4.572467712, + "loss": 0.6107, + "grad_norm": 1.7713807821273804, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.80318208, + "gpu_mem": 4.572393984, + "loss": 0.4377, + "grad_norm": 2.0091569423675537, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.803378688, + "gpu_mem": 4.572392448, + "loss": 0.6015, + "grad_norm": 2.201500177383423, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.803575296, + "gpu_mem": 4.57239552, + "loss": 0.6401, + "grad_norm": 2.0088374614715576, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.803575296, + "gpu_mem": 4.572381696, + "loss": 0.5921, + "grad_norm": 1.8797252178192139, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.803771904, + "gpu_mem": 4.572397056, + "loss": 0.4273, + "grad_norm": 2.0350940227508545, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.803968512, + "gpu_mem": 4.572435456, + "loss": 0.5895, + "grad_norm": 2.0890450477600098, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.80416512, + "gpu_mem": 4.572415488, + "loss": 0.6866, + "grad_norm": 2.0388224124908447, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.804361728, + "gpu_mem": 4.5724416, + "loss": 0.5047, + "grad_norm": 2.2020905017852783, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.804361728, + "gpu_mem": 4.572392448, + "loss": 0.4247, + "grad_norm": 2.184063196182251, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.804558336, + "gpu_mem": 4.572386304, + "loss": 0.6021, + "grad_norm": 2.901616334915161, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.804558336, + "gpu_mem": 4.572409344, + "loss": 0.4846, + "grad_norm": 2.304006814956665, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.804558336, + "gpu_mem": 4.57238784, + "loss": 0.5421, + "grad_norm": 2.455965995788574, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.804951552, + "gpu_mem": 4.572401664, + "loss": 0.7335, + "grad_norm": 2.379002571105957, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.80514816, + "gpu_mem": 4.572406272, + "loss": 0.4613, + "grad_norm": 1.873285174369812, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.80514816, + "gpu_mem": 4.572424704, + "loss": 0.4768, + "grad_norm": 2.0198049545288086, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.80514816, + "gpu_mem": 4.57239552, + "loss": 0.3955, + "grad_norm": 1.7920222282409668, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.805344768, + "gpu_mem": 4.572423168, + "loss": 0.7905, + "grad_norm": 2.2204089164733887, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.805344768, + "gpu_mem": 4.572404736, + "loss": 0.5777, + "grad_norm": 2.1772408485412598, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.805541376, + "gpu_mem": 4.572392448, + "loss": 0.6253, + "grad_norm": 2.0185599327087402, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.805737984, + "gpu_mem": 4.572401664, + "loss": 0.6712, + "grad_norm": 2.2817163467407227, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.805737984, + "gpu_mem": 4.572398592, + "loss": 0.7657, + "grad_norm": 2.5811731815338135, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.805737984, + "gpu_mem": 4.572413952, + "loss": 0.5393, + "grad_norm": 2.1039271354675293, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.805934592, + "gpu_mem": 4.572421632, + "loss": 0.5133, + "grad_norm": 1.7693818807601929, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.8061312, + "gpu_mem": 4.57241088, + "loss": 0.5892, + "grad_norm": 1.9819704294204712, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.806327808, + "gpu_mem": 4.57239552, + "loss": 0.8163, + "grad_norm": 1.8848047256469727, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.806327808, + "gpu_mem": 4.572398592, + "loss": 0.5781, + "grad_norm": 1.8410922288894653, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.806524416, + "gpu_mem": 4.572392448, + "loss": 0.4735, + "grad_norm": 1.7143431901931763, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.806524416, + "gpu_mem": 4.57238784, + "loss": 0.8052, + "grad_norm": 2.3174238204956055, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.806721024, + "gpu_mem": 4.572409344, + "loss": 0.7693, + "grad_norm": 2.0956342220306396, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.806917632, + "gpu_mem": 4.572401664, + "loss": 0.5526, + "grad_norm": 1.4929345846176147, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.806917632, + "gpu_mem": 4.572374016, + "loss": 0.6687, + "grad_norm": 2.2686831951141357, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.80711424, + "gpu_mem": 4.57237248, + "loss": 0.5266, + "grad_norm": 1.6429171562194824, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.80711424, + "gpu_mem": 4.572398592, + "loss": 0.5612, + "grad_norm": 1.7741395235061646, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.80711424, + "gpu_mem": 4.572381696, + "loss": 0.3907, + "grad_norm": 1.6909221410751343, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.80711424, + "gpu_mem": 4.572412416, + "loss": 0.4759, + "grad_norm": 1.8904331922531128, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.807310848, + "gpu_mem": 4.57239552, + "loss": 0.6323, + "grad_norm": 1.925713300704956, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.807507456, + "gpu_mem": 4.57242624, + "loss": 0.5605, + "grad_norm": 1.8923957347869873, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.807704064, + "gpu_mem": 4.572393984, + "loss": 0.5894, + "grad_norm": 2.4258220195770264, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.807704064, + "gpu_mem": 4.572420096, + "loss": 0.4512, + "grad_norm": 2.0147106647491455, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.807900672, + "gpu_mem": 4.57239552, + "loss": 0.4176, + "grad_norm": 1.515066385269165, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.807900672, + "gpu_mem": 4.572390912, + "loss": 0.6255, + "grad_norm": 2.3580846786499023, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.80809728, + "gpu_mem": 4.572393984, + "loss": 0.4415, + "grad_norm": 2.349097490310669, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.80809728, + "gpu_mem": 4.572412416, + "loss": 0.4249, + "grad_norm": 2.138343095779419, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.808293888, + "gpu_mem": 4.572392448, + "loss": 0.669, + "grad_norm": 2.462341070175171, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.808293888, + "gpu_mem": 4.572397056, + "loss": 0.5479, + "grad_norm": 2.2205259799957275, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.808293888, + "gpu_mem": 4.572392448, + "loss": 0.7091, + "grad_norm": 2.9859628677368164, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.808490496, + "gpu_mem": 4.572400128, + "loss": 0.4131, + "grad_norm": 1.763095736503601, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.808490496, + "gpu_mem": 4.572424704, + "loss": 0.6072, + "grad_norm": 2.32967472076416, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.808687104, + "gpu_mem": 4.572417024, + "loss": 0.5083, + "grad_norm": 2.2245593070983887, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.808883712, + "gpu_mem": 4.57241856, + "loss": 0.549, + "grad_norm": 1.6320501565933228, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.808883712, + "gpu_mem": 4.572393984, + "loss": 0.4643, + "grad_norm": 1.7853938341140747, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.808883712, + "gpu_mem": 4.57239552, + "loss": 0.6403, + "grad_norm": 2.10166335105896, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.80908032, + "gpu_mem": 4.572415488, + "loss": 0.6473, + "grad_norm": 2.133596658706665, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.809276928, + "gpu_mem": 4.57238784, + "loss": 0.6849, + "grad_norm": 2.5037779808044434, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.809473536, + "gpu_mem": 4.572400128, + "loss": 0.3333, + "grad_norm": 1.5264371633529663, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.809473536, + "gpu_mem": 4.572409344, + "loss": 0.4241, + "grad_norm": 1.8263121843338013, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.809670144, + "gpu_mem": 4.572386304, + "loss": 0.5645, + "grad_norm": 2.1158697605133057, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.809670144, + "gpu_mem": 4.57241088, + "loss": 0.633, + "grad_norm": 2.2622430324554443, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.809670144, + "gpu_mem": 4.57241088, + "loss": 0.5457, + "grad_norm": 1.5456417798995972, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.809670144, + "gpu_mem": 4.572393984, + "loss": 0.475, + "grad_norm": 1.8432239294052124, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.809866752, + "gpu_mem": 4.572390912, + "loss": 0.5158, + "grad_norm": 2.2177534103393555, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.809866752, + "gpu_mem": 4.572383232, + "loss": 0.4812, + "grad_norm": 1.862587809562683, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.809866752, + "gpu_mem": 4.572421632, + "loss": 0.4767, + "grad_norm": 1.7609305381774902, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.81006336, + "gpu_mem": 4.572398592, + "loss": 0.8867, + "grad_norm": 2.5160202980041504, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.810259968, + "gpu_mem": 4.572397056, + "loss": 0.6668, + "grad_norm": 3.0232224464416504, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.810259968, + "gpu_mem": 4.572413952, + "loss": 0.4639, + "grad_norm": 2.0184779167175293, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.810259968, + "gpu_mem": 4.572398592, + "loss": 0.508, + "grad_norm": 1.9582005739212036, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.810259968, + "gpu_mem": 4.57241088, + "loss": 0.485, + "grad_norm": 1.7651203870773315, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.810456576, + "gpu_mem": 4.572423168, + "loss": 0.4638, + "grad_norm": 1.8525420427322388, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.810456576, + "gpu_mem": 4.572398592, + "loss": 0.4801, + "grad_norm": 1.774743676185608, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.810456576, + "gpu_mem": 4.572443136, + "loss": 0.6667, + "grad_norm": 2.0509798526763916, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.810456576, + "gpu_mem": 4.572417024, + "loss": 0.5803, + "grad_norm": 1.890817642211914, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.810653184, + "gpu_mem": 4.572413952, + "loss": 0.5528, + "grad_norm": 1.8789170980453491, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.810849792, + "gpu_mem": 4.57239552, + "loss": 0.3921, + "grad_norm": 1.6118203401565552, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.810849792, + "gpu_mem": 4.572401664, + "loss": 0.4195, + "grad_norm": 1.7180877923965454, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.810849792, + "gpu_mem": 4.572370944, + "loss": 0.4661, + "grad_norm": 1.7766473293304443, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.810849792, + "gpu_mem": 4.572435456, + "loss": 0.5998, + "grad_norm": 2.139861583709717, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.8110464, + "gpu_mem": 4.572389376, + "loss": 0.4928, + "grad_norm": 2.1838932037353516, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.811243008, + "gpu_mem": 4.572383232, + "loss": 0.5493, + "grad_norm": 1.914306402206421, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.811243008, + "gpu_mem": 4.572438528, + "loss": 0.2918, + "grad_norm": 1.2906224727630615, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.811243008, + "gpu_mem": 4.572404736, + "loss": 0.3743, + "grad_norm": 1.8563796281814575, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.811439616, + "gpu_mem": 4.572392448, + "loss": 0.4555, + "grad_norm": 2.147393226623535, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.811439616, + "gpu_mem": 4.572397056, + "loss": 0.4174, + "grad_norm": 1.9877393245697021, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.811636224, + "gpu_mem": 4.572377088, + "loss": 0.5443, + "grad_norm": 2.892596960067749, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.811636224, + "gpu_mem": 4.572401664, + "loss": 0.4713, + "grad_norm": 1.8485485315322876, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.811832832, + "gpu_mem": 4.57238016, + "loss": 0.357, + "grad_norm": 1.8582942485809326, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.811832832, + "gpu_mem": 4.572397056, + "loss": 0.3965, + "grad_norm": 1.944653868675232, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.811832832, + "gpu_mem": 4.572361728, + "loss": 0.603, + "grad_norm": 3.2797741889953613, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.811832832, + "gpu_mem": 4.572393984, + "loss": 0.3229, + "grad_norm": 1.8441203832626343, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.81202944, + "gpu_mem": 4.572383232, + "loss": 0.5466, + "grad_norm": 2.8274447917938232, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.812226048, + "gpu_mem": 4.572420096, + "loss": 0.5368, + "grad_norm": 2.7604212760925293, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.812226048, + "gpu_mem": 4.572386304, + "loss": 0.4891, + "grad_norm": 2.1206562519073486, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.812226048, + "gpu_mem": 4.572409344, + "loss": 0.5336, + "grad_norm": 2.0888586044311523, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.812226048, + "gpu_mem": 4.572398592, + "loss": 0.4663, + "grad_norm": 2.0039443969726562, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.812226048, + "gpu_mem": 4.572404736, + "loss": 0.4553, + "grad_norm": 2.2016820907592773, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.812422656, + "gpu_mem": 4.572398592, + "loss": 0.6332, + "grad_norm": 2.3679275512695312, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.812619264, + "gpu_mem": 4.572417024, + "loss": 0.6823, + "grad_norm": 2.2347846031188965, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.812619264, + "gpu_mem": 4.572377088, + "loss": 0.3888, + "grad_norm": 1.6727988719940186, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.812619264, + "gpu_mem": 4.572409344, + "loss": 0.7508, + "grad_norm": 2.3906848430633545, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.812815872, + "gpu_mem": 4.572429312, + "loss": 0.4829, + "grad_norm": 1.709958553314209, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.812815872, + "gpu_mem": 4.572423168, + "loss": 0.4353, + "grad_norm": 1.8110034465789795, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.812815872, + "gpu_mem": 4.572386304, + "loss": 0.5259, + "grad_norm": 2.3991684913635254, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.812815872, + "gpu_mem": 4.5724032, + "loss": 0.3505, + "grad_norm": 1.6201503276824951, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.812815872, + "gpu_mem": 4.57238016, + "loss": 0.6067, + "grad_norm": 1.6597931385040283, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.812815872, + "gpu_mem": 4.572412416, + "loss": 0.5565, + "grad_norm": 2.3403549194335938, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.81301248, + "gpu_mem": 4.572407808, + "loss": 0.4825, + "grad_norm": 2.184373378753662, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.81301248, + "gpu_mem": 4.572417024, + "loss": 0.3095, + "grad_norm": 2.000377655029297, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.81301248, + "gpu_mem": 4.572390912, + "loss": 0.4528, + "grad_norm": 2.129002571105957, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.813209088, + "gpu_mem": 4.57241088, + "loss": 0.4462, + "grad_norm": 1.9592366218566895, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.813209088, + "gpu_mem": 4.572384768, + "loss": 0.5364, + "grad_norm": 2.4130051136016846, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.813209088, + "gpu_mem": 4.572409344, + "loss": 0.4957, + "grad_norm": 1.9423049688339233, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.813405696, + "gpu_mem": 4.572393984, + "loss": 0.337, + "grad_norm": 1.8135566711425781, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.813405696, + "gpu_mem": 4.572427776, + "loss": 0.6022, + "grad_norm": 2.3227410316467285, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.813602304, + "gpu_mem": 4.572407808, + "loss": 0.4997, + "grad_norm": 2.731081962585449, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.813602304, + "gpu_mem": 4.572392448, + "loss": 0.5342, + "grad_norm": 1.9342825412750244, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.813602304, + "gpu_mem": 4.572427776, + "loss": 0.5116, + "grad_norm": 2.307800531387329, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.813602304, + "gpu_mem": 4.57243392, + "loss": 0.3456, + "grad_norm": 1.647102952003479, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.813798912, + "gpu_mem": 4.572397056, + "loss": 0.515, + "grad_norm": 2.5735957622528076, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.813798912, + "gpu_mem": 4.572375552, + "loss": 0.5177, + "grad_norm": 1.9279035329818726, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.813798912, + "gpu_mem": 4.572427776, + "loss": 0.616, + "grad_norm": 2.1695919036865234, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.813798912, + "gpu_mem": 4.572413952, + "loss": 0.3925, + "grad_norm": 1.658004641532898, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.813798912, + "gpu_mem": 4.572407808, + "loss": 0.5029, + "grad_norm": 2.1272757053375244, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.81399552, + "gpu_mem": 4.572413952, + "loss": 0.4599, + "grad_norm": 1.3876793384552002, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.81399552, + "gpu_mem": 4.572390912, + "loss": 0.6083, + "grad_norm": 1.9667943716049194, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.81399552, + "gpu_mem": 4.572404736, + "loss": 0.3586, + "grad_norm": 1.7789164781570435, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.81399552, + "gpu_mem": 4.572404736, + "loss": 0.5679, + "grad_norm": 1.8229703903198242, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.81399552, + "gpu_mem": 4.572374016, + "loss": 0.4824, + "grad_norm": 1.6676251888275146, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.814192128, + "gpu_mem": 4.572407808, + "loss": 0.4954, + "grad_norm": 2.027183771133423, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.814192128, + "gpu_mem": 4.572386304, + "loss": 0.4772, + "grad_norm": 1.8551509380340576, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.814192128, + "gpu_mem": 4.572393984, + "loss": 0.3021, + "grad_norm": 1.5631887912750244, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.814192128, + "gpu_mem": 4.572412416, + "loss": 0.4249, + "grad_norm": 1.5011136531829834, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.814388736, + "gpu_mem": 4.57238016, + "loss": 0.8016, + "grad_norm": 3.149914026260376, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.814388736, + "gpu_mem": 4.572384768, + "loss": 0.4919, + "grad_norm": 2.113513946533203, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.814388736, + "gpu_mem": 4.57238016, + "loss": 0.4062, + "grad_norm": 1.8861093521118164, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.814585344, + "gpu_mem": 4.572424704, + "loss": 0.6181, + "grad_norm": 2.2513864040374756, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.814585344, + "gpu_mem": 4.572407808, + "loss": 0.3604, + "grad_norm": 1.5365360975265503, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.814585344, + "gpu_mem": 4.572397056, + "loss": 0.4382, + "grad_norm": 1.7543511390686035, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.814781952, + "gpu_mem": 4.57241856, + "loss": 0.432, + "grad_norm": 2.1594126224517822, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.814781952, + "gpu_mem": 4.572384768, + "loss": 0.4267, + "grad_norm": 1.9450078010559082, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.814781952, + "gpu_mem": 4.572400128, + "loss": 0.4283, + "grad_norm": 1.7857673168182373, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.814781952, + "gpu_mem": 4.572400128, + "loss": 0.4974, + "grad_norm": 1.9128037691116333, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.814781952, + "gpu_mem": 4.572390912, + "loss": 0.4275, + "grad_norm": 2.1618385314941406, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.81497856, + "gpu_mem": 4.572401664, + "loss": 0.7253, + "grad_norm": 2.6955342292785645, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.81497856, + "gpu_mem": 4.57242624, + "loss": 0.472, + "grad_norm": 2.0836822986602783, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.81497856, + "gpu_mem": 4.572378624, + "loss": 0.7215, + "grad_norm": 2.314744472503662, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.81497856, + "gpu_mem": 4.572413952, + "loss": 0.54, + "grad_norm": 2.1989049911499023, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.81497856, + "gpu_mem": 4.572375552, + "loss": 0.4174, + "grad_norm": 2.1271347999572754, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.81497856, + "gpu_mem": 4.572393984, + "loss": 0.5066, + "grad_norm": 1.9919546842575073, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.815175168, + "gpu_mem": 4.572386304, + "loss": 0.4611, + "grad_norm": 1.5797715187072754, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572423168, + "loss": 0.4974, + "grad_norm": 2.173483371734619, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572383232, + "loss": 0.3582, + "grad_norm": 1.7789108753204346, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572397056, + "loss": 0.4157, + "grad_norm": 1.8692355155944824, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572401664, + "loss": 0.2899, + "grad_norm": 1.5685145854949951, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572363264, + "loss": 0.5857, + "grad_norm": 2.234609842300415, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572386304, + "loss": 0.6275, + "grad_norm": 1.9095319509506226, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572384768, + "loss": 0.4144, + "grad_norm": 1.9071664810180664, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.815371776, + "gpu_mem": 4.5724032, + "loss": 0.4275, + "grad_norm": 1.8783771991729736, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572400128, + "loss": 0.4545, + "grad_norm": 2.030482053756714, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.815371776, + "gpu_mem": 4.572398592, + "loss": 0.6097, + "grad_norm": 2.3348824977874756, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.815568384, + "gpu_mem": 4.572417024, + "loss": 0.3017, + "grad_norm": 1.7290152311325073, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.815764992, + "gpu_mem": 4.572378624, + "loss": 0.4456, + "grad_norm": 2.073530673980713, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.815764992, + "gpu_mem": 4.572423168, + "loss": 0.3621, + "grad_norm": 1.6620073318481445, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.815764992, + "gpu_mem": 4.57238784, + "loss": 0.4308, + "grad_norm": 1.9856208562850952, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.815764992, + "gpu_mem": 4.572415488, + "loss": 0.5324, + "grad_norm": 2.2417540550231934, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.815764992, + "gpu_mem": 4.57239552, + "loss": 0.4605, + "grad_norm": 2.285654306411743, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.8159616, + "gpu_mem": 4.5724416, + "loss": 0.5226, + "grad_norm": 2.1244232654571533, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.8159616, + "gpu_mem": 4.572406272, + "loss": 0.4125, + "grad_norm": 1.8309661149978638, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.8159616, + "gpu_mem": 4.572397056, + "loss": 0.3687, + "grad_norm": 1.90573251247406, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.8159616, + "gpu_mem": 4.572390912, + "loss": 0.3497, + "grad_norm": 1.921601414680481, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.8159616, + "gpu_mem": 4.572375552, + "loss": 0.4507, + "grad_norm": 1.9451408386230469, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.8159616, + "gpu_mem": 4.572393984, + "loss": 0.7253, + "grad_norm": 3.176419973373413, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.816158208, + "gpu_mem": 4.57239552, + "loss": 0.4138, + "grad_norm": 2.023298740386963, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.816158208, + "gpu_mem": 4.572400128, + "loss": 0.4763, + "grad_norm": 2.3130581378936768, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.816158208, + "gpu_mem": 4.5724032, + "loss": 0.384, + "grad_norm": 2.273935556411743, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.816158208, + "gpu_mem": 4.572397056, + "loss": 0.7198, + "grad_norm": 2.473442792892456, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.816158208, + "gpu_mem": 4.572423168, + "loss": 0.6944, + "grad_norm": 2.95565128326416, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.816158208, + "gpu_mem": 4.572390912, + "loss": 0.3851, + "grad_norm": 1.5766470432281494, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.816158208, + "gpu_mem": 4.57241856, + "loss": 0.3546, + "grad_norm": 1.712007999420166, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.816158208, + "gpu_mem": 4.57242624, + "loss": 0.4116, + "grad_norm": 2.319373846054077, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.816158208, + "gpu_mem": 4.572407808, + "loss": 0.547, + "grad_norm": 2.5570788383483887, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572393984, + "loss": 0.3139, + "grad_norm": 1.6993554830551147, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572404736, + "loss": 0.4649, + "grad_norm": 2.187100410461426, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572397056, + "loss": 0.3832, + "grad_norm": 1.769230842590332, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572413952, + "loss": 0.5495, + "grad_norm": 1.9816536903381348, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572386304, + "loss": 0.3969, + "grad_norm": 2.0151307582855225, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572417024, + "loss": 0.4399, + "grad_norm": 1.8947246074676514, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572398592, + "loss": 0.6072, + "grad_norm": 3.0281028747558594, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572386304, + "loss": 0.4837, + "grad_norm": 1.5698434114456177, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.816354816, + "gpu_mem": 4.572398592, + "loss": 0.4434, + "grad_norm": 1.6022676229476929, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572404736, + "loss": 0.3706, + "grad_norm": 1.911746859550476, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572392448, + "loss": 0.3, + "grad_norm": 1.7526575326919556, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572381696, + "loss": 0.3495, + "grad_norm": 2.1251373291015625, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572383232, + "loss": 0.4025, + "grad_norm": 1.8236830234527588, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572397056, + "loss": 0.5385, + "grad_norm": 1.6412752866744995, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572400128, + "loss": 0.5144, + "grad_norm": 1.963837742805481, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.816551424, + "gpu_mem": 4.57241088, + "loss": 0.4102, + "grad_norm": 2.1447224617004395, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572384768, + "loss": 0.4317, + "grad_norm": 1.9165029525756836, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.816551424, + "gpu_mem": 4.572400128, + "loss": 0.6374, + "grad_norm": 2.1453607082366943, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.816748032, + "gpu_mem": 4.572409344, + "loss": 0.471, + "grad_norm": 1.8836634159088135, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.816748032, + "gpu_mem": 4.572383232, + "loss": 0.4925, + "grad_norm": 2.0671310424804688, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.81694464, + "gpu_mem": 4.572389376, + "loss": 0.4434, + "grad_norm": 1.773122787475586, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.81694464, + "gpu_mem": 4.572378624, + "loss": 0.4866, + "grad_norm": 1.8623539209365845, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.81694464, + "gpu_mem": 4.572384768, + "loss": 0.3953, + "grad_norm": 1.8052185773849487, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.81694464, + "gpu_mem": 4.572421632, + "loss": 0.3042, + "grad_norm": 1.6068843603134155, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.81694464, + "gpu_mem": 4.572369408, + "loss": 0.5361, + "grad_norm": 2.126542568206787, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.81694464, + "gpu_mem": 4.572389376, + "loss": 0.4167, + "grad_norm": 1.8870316743850708, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.81694464, + "gpu_mem": 4.572389376, + "loss": 0.3929, + "grad_norm": 1.8950051069259644, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.81694464, + "gpu_mem": 4.57238784, + "loss": 0.3632, + "grad_norm": 1.6383334398269653, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572386304, + "loss": 0.2792, + "grad_norm": 1.6050939559936523, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572378624, + "loss": 0.5328, + "grad_norm": 2.361191987991333, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572438528, + "loss": 0.3125, + "grad_norm": 1.5569229125976562, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572383232, + "loss": 0.4373, + "grad_norm": 2.1483192443847656, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572366336, + "loss": 0.5435, + "grad_norm": 2.129880428314209, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572397056, + "loss": 0.6663, + "grad_norm": 2.4833476543426514, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.817141248, + "gpu_mem": 4.5724416, + "loss": 0.5968, + "grad_norm": 2.3417773246765137, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572421632, + "loss": 0.2734, + "grad_norm": 1.4951038360595703, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572421632, + "loss": 0.3206, + "grad_norm": 1.7375810146331787, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.817141248, + "gpu_mem": 4.57238784, + "loss": 0.3763, + "grad_norm": 1.9115643501281738, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.817141248, + "gpu_mem": 4.572412416, + "loss": 0.4947, + "grad_norm": 1.75957190990448, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572415488, + "loss": 0.523, + "grad_norm": 2.6849780082702637, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572393984, + "loss": 0.4023, + "grad_norm": 2.3551106452941895, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572417024, + "loss": 0.4089, + "grad_norm": 2.463989496231079, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572397056, + "loss": 0.4988, + "grad_norm": 2.365406036376953, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572421632, + "loss": 0.3642, + "grad_norm": 1.4577648639678955, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572404736, + "loss": 0.4714, + "grad_norm": 2.3883488178253174, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572400128, + "loss": 0.4813, + "grad_norm": 2.4021058082580566, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572412416, + "loss": 0.3866, + "grad_norm": 1.8651506900787354, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572383232, + "loss": 0.4023, + "grad_norm": 1.793846607208252, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.817337856, + "gpu_mem": 4.572397056, + "loss": 0.579, + "grad_norm": 2.1746044158935547, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572383232, + "loss": 0.4543, + "grad_norm": 2.0470495223999023, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572377088, + "loss": 0.4796, + "grad_norm": 2.248880386352539, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572383232, + "loss": 0.2654, + "grad_norm": 1.3822038173675537, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572397056, + "loss": 0.522, + "grad_norm": 2.0930724143981934, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.817534464, + "gpu_mem": 4.57238016, + "loss": 0.5952, + "grad_norm": 2.076791524887085, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.817534464, + "gpu_mem": 4.57243392, + "loss": 0.3627, + "grad_norm": 1.5023587942123413, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572377088, + "loss": 0.4247, + "grad_norm": 1.9354780912399292, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572455424, + "loss": 0.4765, + "grad_norm": 1.70867919921875, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572398592, + "loss": 0.395, + "grad_norm": 1.9125045537948608, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572417024, + "loss": 0.4485, + "grad_norm": 2.05828857421875, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572392448, + "loss": 0.3864, + "grad_norm": 1.626701831817627, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.817534464, + "gpu_mem": 4.572424704, + "loss": 0.431, + "grad_norm": 1.8012969493865967, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.817731072, + "gpu_mem": 4.572444672, + "loss": 0.5479, + "grad_norm": 2.333224296569824, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.817731072, + "gpu_mem": 4.572374016, + "loss": 0.3208, + "grad_norm": 1.5325191020965576, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.817731072, + "gpu_mem": 4.57238784, + "loss": 0.2635, + "grad_norm": 1.3742403984069824, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.817731072, + "gpu_mem": 4.57237248, + "loss": 0.5061, + "grad_norm": 2.1871979236602783, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.817731072, + "gpu_mem": 4.57241088, + "loss": 0.3481, + "grad_norm": 1.5297473669052124, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.817731072, + "gpu_mem": 4.57241088, + "loss": 0.4847, + "grad_norm": 2.298027753829956, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.817731072, + "gpu_mem": 4.572397056, + "loss": 0.3915, + "grad_norm": 1.558052659034729, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.817731072, + "gpu_mem": 4.57238784, + "loss": 0.4218, + "grad_norm": 2.019594192504883, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.817731072, + "gpu_mem": 4.572392448, + "loss": 0.4949, + "grad_norm": 1.7942453622817993, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.817731072, + "gpu_mem": 4.57239552, + "loss": 0.4238, + "grad_norm": 1.9291306734085083, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572401664, + "loss": 0.4106, + "grad_norm": 1.660714864730835, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572420096, + "loss": 0.4893, + "grad_norm": 2.343261957168579, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572413952, + "loss": 0.6707, + "grad_norm": 2.674065351486206, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572390912, + "loss": 0.2927, + "grad_norm": 1.607337474822998, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572378624, + "loss": 0.5244, + "grad_norm": 1.9733198881149292, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572344832, + "loss": 0.4722, + "grad_norm": 2.128936290740967, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572392448, + "loss": 0.4581, + "grad_norm": 2.170234441757202, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572358656, + "loss": 0.5784, + "grad_norm": 2.0886926651000977, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572406272, + "loss": 0.4024, + "grad_norm": 1.674379825592041, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572404736, + "loss": 0.6144, + "grad_norm": 1.9908626079559326, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572406272, + "loss": 0.2585, + "grad_norm": 1.1616791486740112, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572415488, + "loss": 0.5978, + "grad_norm": 2.6362144947052, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572390912, + "loss": 0.7071, + "grad_norm": 2.5829038619995117, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.81792768, + "gpu_mem": 4.572375552, + "loss": 0.5474, + "grad_norm": 2.151520252227783, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572404736, + "loss": 0.4746, + "grad_norm": 1.910907506942749, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.818124288, + "gpu_mem": 4.57241856, + "loss": 0.4733, + "grad_norm": 2.0243418216705322, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572374016, + "loss": 0.3905, + "grad_norm": 1.7571030855178833, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.818124288, + "gpu_mem": 4.57238016, + "loss": 0.4772, + "grad_norm": 1.60667085647583, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572409344, + "loss": 0.5513, + "grad_norm": 1.9388078451156616, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572404736, + "loss": 0.3297, + "grad_norm": 1.5909992456436157, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572390912, + "loss": 0.4853, + "grad_norm": 1.7063899040222168, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572404736, + "loss": 0.3202, + "grad_norm": 1.483625888824463, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572393984, + "loss": 0.425, + "grad_norm": 1.8786370754241943, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572400128, + "loss": 0.3795, + "grad_norm": 1.6258753538131714, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572404736, + "loss": 0.6213, + "grad_norm": 2.5765862464904785, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572400128, + "loss": 0.399, + "grad_norm": 1.6496835947036743, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572374016, + "loss": 0.4456, + "grad_norm": 1.8716294765472412, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572383232, + "loss": 0.3795, + "grad_norm": 1.7129846811294556, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572401664, + "loss": 0.5149, + "grad_norm": 1.8524696826934814, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.818124288, + "gpu_mem": 4.57237248, + "loss": 0.4399, + "grad_norm": 1.9076368808746338, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.818124288, + "gpu_mem": 4.5724032, + "loss": 0.478, + "grad_norm": 2.094322919845581, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572412416, + "loss": 0.4116, + "grad_norm": 1.6616631746292114, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.818124288, + "gpu_mem": 4.572374016, + "loss": 0.6304, + "grad_norm": 2.494316816329956, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572378624, + "loss": 0.4277, + "grad_norm": 2.2613613605499268, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.818320896, + "gpu_mem": 4.5724032, + "loss": 0.4496, + "grad_norm": 1.6608104705810547, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572421632, + "loss": 0.3477, + "grad_norm": 1.6910827159881592, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.818320896, + "gpu_mem": 4.5724032, + "loss": 0.5667, + "grad_norm": 2.165856122970581, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572453888, + "loss": 0.6435, + "grad_norm": 2.398664712905884, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572386304, + "loss": 0.8357, + "grad_norm": 2.666874647140503, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.818320896, + "gpu_mem": 4.57238784, + "loss": 0.3399, + "grad_norm": 1.694171667098999, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.818320896, + "gpu_mem": 4.57238784, + "loss": 0.4141, + "grad_norm": 1.6301894187927246, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572393984, + "loss": 0.3996, + "grad_norm": 1.728690505027771, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572407808, + "loss": 0.2845, + "grad_norm": 1.273030161857605, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572412416, + "loss": 0.5983, + "grad_norm": 2.4595842361450195, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572406272, + "loss": 0.3654, + "grad_norm": 1.5745009183883667, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572400128, + "loss": 0.4396, + "grad_norm": 2.297952175140381, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572413952, + "loss": 0.5226, + "grad_norm": 2.300553560256958, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572406272, + "loss": 0.6127, + "grad_norm": 2.3296618461608887, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572390912, + "loss": 0.431, + "grad_norm": 2.121154308319092, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572400128, + "loss": 0.5235, + "grad_norm": 1.9105228185653687, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572409344, + "loss": 0.3803, + "grad_norm": 1.6965358257293701, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572412416, + "loss": 0.398, + "grad_norm": 1.8887690305709839, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572366336, + "loss": 0.4219, + "grad_norm": 1.6658941507339478, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572420096, + "loss": 0.3302, + "grad_norm": 1.545142650604248, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572421632, + "loss": 0.3909, + "grad_norm": 1.6964352130889893, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572366336, + "loss": 0.4357, + "grad_norm": 1.6184147596359253, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572400128, + "loss": 0.4923, + "grad_norm": 1.9646446704864502, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.818320896, + "gpu_mem": 4.572378624, + "loss": 0.5139, + "grad_norm": 2.152498483657837, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572409344, + "loss": 0.48, + "grad_norm": 1.904234766960144, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572384768, + "loss": 0.3783, + "grad_norm": 1.9898953437805176, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.818517504, + "gpu_mem": 4.57241856, + "loss": 0.3362, + "grad_norm": 1.8201286792755127, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572438528, + "loss": 0.5949, + "grad_norm": 2.094822645187378, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.818517504, + "gpu_mem": 4.5724032, + "loss": 0.3743, + "grad_norm": 1.5895158052444458, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572423168, + "loss": 0.3262, + "grad_norm": 1.7037602663040161, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.818517504, + "gpu_mem": 4.5724032, + "loss": 0.3905, + "grad_norm": 1.6365033388137817, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572404736, + "loss": 0.4993, + "grad_norm": 2.3648412227630615, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572397056, + "loss": 0.3482, + "grad_norm": 1.605014443397522, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.818517504, + "gpu_mem": 4.57239552, + "loss": 0.4908, + "grad_norm": 1.9327313899993896, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572409344, + "loss": 0.432, + "grad_norm": 1.671717882156372, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.818517504, + "gpu_mem": 4.57238016, + "loss": 0.4478, + "grad_norm": 1.5281836986541748, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572430848, + "loss": 0.4619, + "grad_norm": 1.9861153364181519, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572427776, + "loss": 0.4774, + "grad_norm": 1.8905874490737915, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572407808, + "loss": 0.4145, + "grad_norm": 1.874778151512146, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572389376, + "loss": 0.424, + "grad_norm": 1.661137580871582, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572398592, + "loss": 0.3212, + "grad_norm": 1.7450460195541382, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572366336, + "loss": 0.426, + "grad_norm": 2.4419991970062256, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572427776, + "loss": 0.4728, + "grad_norm": 1.781221628189087, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.818517504, + "gpu_mem": 4.57242624, + "loss": 0.3622, + "grad_norm": 1.828503966331482, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572381696, + "loss": 0.4201, + "grad_norm": 1.726698637008667, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572413952, + "loss": 0.3779, + "grad_norm": 1.606973648071289, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572407808, + "loss": 0.484, + "grad_norm": 2.3565497398376465, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572393984, + "loss": 0.326, + "grad_norm": 1.5953701734542847, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572393984, + "loss": 0.4946, + "grad_norm": 1.96070396900177, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572420096, + "loss": 0.3222, + "grad_norm": 1.2284862995147705, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572407808, + "loss": 0.291, + "grad_norm": 1.3124595880508423, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572398592, + "loss": 0.5016, + "grad_norm": 1.5796568393707275, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572390912, + "loss": 0.552, + "grad_norm": 2.058391571044922, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572413952, + "loss": 0.3261, + "grad_norm": 1.5234756469726562, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.818517504, + "gpu_mem": 4.5724032, + "loss": 0.2719, + "grad_norm": 1.5803236961364746, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.818517504, + "gpu_mem": 4.57238784, + "loss": 0.3343, + "grad_norm": 1.3270512819290161, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572453888, + "loss": 0.4189, + "grad_norm": 1.6430644989013672, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572397056, + "loss": 0.4608, + "grad_norm": 2.0475971698760986, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572383232, + "loss": 0.5585, + "grad_norm": 2.305540084838867, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572447744, + "loss": 0.5132, + "grad_norm": 1.8071883916854858, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572377088, + "loss": 0.4357, + "grad_norm": 1.9215466976165771, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572404736, + "loss": 0.2588, + "grad_norm": 1.5094534158706665, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572407808, + "loss": 0.5259, + "grad_norm": 2.097648859024048, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572383232, + "loss": 0.5451, + "grad_norm": 1.994186282157898, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.818517504, + "gpu_mem": 4.57241088, + "loss": 0.4685, + "grad_norm": 1.8602310419082642, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572420096, + "loss": 0.373, + "grad_norm": 1.8432235717773438, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572417024, + "loss": 0.3975, + "grad_norm": 1.3256340026855469, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572389376, + "loss": 0.5038, + "grad_norm": 1.9081904888153076, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572135936, + "loss": 0.425, + "grad_norm": 2.0672781467437744, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.818517504, + "gpu_mem": 4.572135936, + "train_runtime": 8092.7093, + "train_samples_per_second": 4.931, + "train_steps_per_second": 0.077, + "total_flos": 0.0, + "train_loss": 0.8038664028908198 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ded8039b496858a8aa3d756f427279337f8964 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..de5a99aeb61eecd7cae5551f975ae799b374fb4f --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.6353316072495518 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..26c51dccad9c273ba8585e7b872f9430666cb6a0 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3163776 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-hellaswag-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2", + "seed": 42, + "timestamp": "2025-09-02T15:50:58.917981" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..23336d078994c20ca4f728693f533e49ae3a7d2b --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-hellaswag-r8-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.700429824, + "gpu_mem": 4.431066624, + "loss": 3.4877, + "grad_norm": 14.095464706420898, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.701412864, + "gpu_mem": 4.456449024, + "loss": 3.6203, + "grad_norm": 13.875768661499023, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.702592512, + "gpu_mem": 4.456456704, + "loss": 3.405, + "grad_norm": 13.723123550415039, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.703575552, + "gpu_mem": 4.456490496, + "loss": 3.5459, + "grad_norm": 13.107795715332031, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.704558592, + "gpu_mem": 4.456453632, + "loss": 3.381, + "grad_norm": 13.134697914123535, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.705541632, + "gpu_mem": 4.456499712, + "loss": 3.3822, + "grad_norm": 13.9066801071167, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.706524672, + "gpu_mem": 4.456459776, + "loss": 3.2623, + "grad_norm": 11.901814460754395, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.707311104, + "gpu_mem": 4.456490496, + "loss": 2.8683, + "grad_norm": 12.362210273742676, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.708097536, + "gpu_mem": 4.456490496, + "loss": 2.7071, + "grad_norm": 10.18989086151123, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.708883968, + "gpu_mem": 4.456433664, + "loss": 2.4799, + "grad_norm": 7.9352707862854, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.7096704, + "gpu_mem": 4.456453632, + "loss": 2.2769, + "grad_norm": 8.0736083984375, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.710456832, + "gpu_mem": 4.45645056, + "loss": 2.3926, + "grad_norm": 7.47382116317749, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.711046656, + "gpu_mem": 4.45644288, + "loss": 2.1334, + "grad_norm": 5.7148590087890625, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.711833088, + "gpu_mem": 4.456468992, + "loss": 1.9836, + "grad_norm": 4.707211971282959, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.71261952, + "gpu_mem": 4.456467456, + "loss": 1.7634, + "grad_norm": 3.455317735671997, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.713405952, + "gpu_mem": 4.456459776, + "loss": 1.7901, + "grad_norm": 3.486337184906006, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.713995776, + "gpu_mem": 4.456459776, + "loss": 1.6633, + "grad_norm": 2.325795888900757, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.714782208, + "gpu_mem": 4.456459776, + "loss": 1.6433, + "grad_norm": 2.396042585372925, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.715372032, + "gpu_mem": 4.456459776, + "loss": 1.5572, + "grad_norm": 1.554761290550232, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.715961856, + "gpu_mem": 4.456433664, + "loss": 1.4401, + "grad_norm": 1.1871362924575806, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.716748288, + "gpu_mem": 4.45645056, + "loss": 1.4396, + "grad_norm": 1.2787489891052246, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.717338112, + "gpu_mem": 4.45645824, + "loss": 1.5045, + "grad_norm": 1.5204066038131714, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.717927936, + "gpu_mem": 4.456472064, + "loss": 1.397, + "grad_norm": 0.8946168422698975, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.71851776, + "gpu_mem": 4.456456704, + "loss": 1.3788, + "grad_norm": 0.7262819409370422, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.719107584, + "gpu_mem": 4.456444416, + "loss": 1.5122, + "grad_norm": 2.0647897720336914, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.719697408, + "gpu_mem": 4.45645056, + "loss": 1.5225, + "grad_norm": 2.4401161670684814, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.72048384, + "gpu_mem": 4.45645824, + "loss": 1.4501, + "grad_norm": 1.4879963397979736, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.720877056, + "gpu_mem": 4.456453632, + "loss": 1.4864, + "grad_norm": 1.7521673440933228, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.72146688, + "gpu_mem": 4.456462848, + "loss": 1.3827, + "grad_norm": 0.5491812825202942, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.722253312, + "gpu_mem": 4.4564352, + "loss": 1.4129, + "grad_norm": 0.8914498090744019, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.722646528, + "gpu_mem": 4.456490496, + "loss": 1.3844, + "grad_norm": 0.617733895778656, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.72343296, + "gpu_mem": 4.456482816, + "loss": 1.4058, + "grad_norm": 1.0581271648406982, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.723826176, + "gpu_mem": 4.456436736, + "loss": 1.3865, + "grad_norm": 0.5864410996437073, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.724416, + "gpu_mem": 4.456455168, + "loss": 1.417, + "grad_norm": 0.6384380459785461, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.725005824, + "gpu_mem": 4.456476672, + "loss": 1.3956, + "grad_norm": 1.118798017501831, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.725595648, + "gpu_mem": 4.456475136, + "loss": 1.3867, + "grad_norm": 0.5862351059913635, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.725988864, + "gpu_mem": 4.456507392, + "loss": 1.4084, + "grad_norm": 0.8245809674263, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.726578688, + "gpu_mem": 4.456459776, + "loss": 1.4045, + "grad_norm": 0.5638912916183472, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.727168512, + "gpu_mem": 4.456516608, + "loss": 1.3644, + "grad_norm": 1.0878793001174927, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.727758336, + "gpu_mem": 4.456444416, + "loss": 1.443, + "grad_norm": 1.0749262571334839, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.72834816, + "gpu_mem": 4.456472064, + "loss": 1.3968, + "grad_norm": 0.5044074058532715, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.728741376, + "gpu_mem": 4.456485888, + "loss": 1.4079, + "grad_norm": 0.6611565351486206, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.7293312, + "gpu_mem": 4.456492032, + "loss": 1.3629, + "grad_norm": 0.3436950147151947, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.729921024, + "gpu_mem": 4.456470528, + "loss": 1.3877, + "grad_norm": 0.5219911336898804, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.730510848, + "gpu_mem": 4.456470528, + "loss": 1.3839, + "grad_norm": 0.42558592557907104, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.730904064, + "gpu_mem": 4.456470528, + "loss": 1.4135, + "grad_norm": 0.9319276213645935, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.731493888, + "gpu_mem": 4.456456704, + "loss": 1.3993, + "grad_norm": 0.4072854816913605, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.732083712, + "gpu_mem": 4.456475136, + "loss": 1.3763, + "grad_norm": 0.685032844543457, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.732476928, + "gpu_mem": 4.456487424, + "loss": 1.4057, + "grad_norm": 0.6279550790786743, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.733066752, + "gpu_mem": 4.456464384, + "loss": 1.3756, + "grad_norm": 0.49337947368621826, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.733459968, + "gpu_mem": 4.456449024, + "loss": 1.3644, + "grad_norm": 0.5134803652763367, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.734049792, + "gpu_mem": 4.456453632, + "loss": 1.3649, + "grad_norm": 0.4157271981239319, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.734443008, + "gpu_mem": 4.45648128, + "loss": 1.4344, + "grad_norm": 1.0536940097808838, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.734836224, + "gpu_mem": 4.456456704, + "loss": 1.3968, + "grad_norm": 0.8853192329406738, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.735426048, + "gpu_mem": 4.456475136, + "loss": 1.3943, + "grad_norm": 0.6484671235084534, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.735819264, + "gpu_mem": 4.456468992, + "loss": 1.4027, + "grad_norm": 0.6769310832023621, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.736409088, + "gpu_mem": 4.4564352, + "loss": 1.3851, + "grad_norm": 0.5126485228538513, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.736998912, + "gpu_mem": 4.456464384, + "loss": 1.4163, + "grad_norm": 0.6107249855995178, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.737392128, + "gpu_mem": 4.456447488, + "loss": 1.3445, + "grad_norm": 0.5452185273170471, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.737785344, + "gpu_mem": 4.45648896, + "loss": 1.3796, + "grad_norm": 0.64973384141922, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.73817856, + "gpu_mem": 4.456455168, + "loss": 1.4002, + "grad_norm": 0.4014802873134613, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.738768384, + "gpu_mem": 4.456495104, + "loss": 1.3302, + "grad_norm": 0.5372034907341003, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.7391616, + "gpu_mem": 4.456449024, + "loss": 1.4605, + "grad_norm": 0.6907044053077698, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.739751424, + "gpu_mem": 4.456453632, + "loss": 1.4651, + "grad_norm": 0.700005829334259, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.74014464, + "gpu_mem": 4.45645056, + "loss": 1.455, + "grad_norm": 0.7177985906600952, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.740537856, + "gpu_mem": 4.456468992, + "loss": 1.4275, + "grad_norm": 0.46127229928970337, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.74112768, + "gpu_mem": 4.456461312, + "loss": 1.3946, + "grad_norm": 0.21615774929523468, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.741520896, + "gpu_mem": 4.456445952, + "loss": 1.3972, + "grad_norm": 0.4650282561779022, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.741914112, + "gpu_mem": 4.456516608, + "loss": 1.4045, + "grad_norm": 0.5615075826644897, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.742307328, + "gpu_mem": 4.456467456, + "loss": 1.3964, + "grad_norm": 0.5318522453308105, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.742897152, + "gpu_mem": 4.456492032, + "loss": 1.3949, + "grad_norm": 0.6598122715950012, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.743486976, + "gpu_mem": 4.456462848, + "loss": 1.3949, + "grad_norm": 0.4235268533229828, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.743880192, + "gpu_mem": 4.456455168, + "loss": 1.3758, + "grad_norm": 0.5822791457176208, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.744273408, + "gpu_mem": 4.456449024, + "loss": 1.3786, + "grad_norm": 0.4675252139568329, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.744666624, + "gpu_mem": 4.456478208, + "loss": 1.4501, + "grad_norm": 0.7975900769233704, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.744863232, + "gpu_mem": 4.456468992, + "loss": 1.485, + "grad_norm": 1.1651885509490967, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.745256448, + "gpu_mem": 4.456456704, + "loss": 1.4207, + "grad_norm": 0.5352832078933716, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.745649664, + "gpu_mem": 4.456449024, + "loss": 1.3849, + "grad_norm": 0.44987940788269043, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.74604288, + "gpu_mem": 4.456501248, + "loss": 1.386, + "grad_norm": 0.2897724509239197, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.746632704, + "gpu_mem": 4.456479744, + "loss": 1.4115, + "grad_norm": 0.4007583558559418, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.74702592, + "gpu_mem": 4.4564736, + "loss": 1.3686, + "grad_norm": 0.29046037793159485, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.747419136, + "gpu_mem": 4.45645056, + "loss": 1.4249, + "grad_norm": 0.6103869080543518, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.747812352, + "gpu_mem": 4.456472064, + "loss": 1.3906, + "grad_norm": 0.38035935163497925, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.748205568, + "gpu_mem": 4.456444416, + "loss": 1.4282, + "grad_norm": 0.5124643445014954, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.748598784, + "gpu_mem": 4.456452096, + "loss": 1.3824, + "grad_norm": 0.1932167410850525, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.749188608, + "gpu_mem": 4.456470528, + "loss": 1.3733, + "grad_norm": 0.2339990884065628, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.749581824, + "gpu_mem": 4.456459776, + "loss": 1.39, + "grad_norm": 0.3170234262943268, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.74997504, + "gpu_mem": 4.45645824, + "loss": 1.4019, + "grad_norm": 0.483407199382782, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.750171648, + "gpu_mem": 4.456453632, + "loss": 1.3935, + "grad_norm": 0.31262341141700745, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.750368256, + "gpu_mem": 4.45645824, + "loss": 1.3655, + "grad_norm": 0.33274024724960327, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.75095808, + "gpu_mem": 4.456468992, + "loss": 1.4007, + "grad_norm": 0.45294469594955444, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.751351296, + "gpu_mem": 4.456472064, + "loss": 1.413, + "grad_norm": 0.4251677393913269, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.751744512, + "gpu_mem": 4.456472064, + "loss": 1.3981, + "grad_norm": 0.21787908673286438, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.752137728, + "gpu_mem": 4.456467456, + "loss": 1.416, + "grad_norm": 0.46231982111930847, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.752530944, + "gpu_mem": 4.456485888, + "loss": 1.3697, + "grad_norm": 0.3362692594528198, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.752727552, + "gpu_mem": 4.45648896, + "loss": 1.3779, + "grad_norm": 0.18292684853076935, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.753317376, + "gpu_mem": 4.45646592, + "loss": 1.4089, + "grad_norm": 0.3484671115875244, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.753710592, + "gpu_mem": 4.456476672, + "loss": 1.3901, + "grad_norm": 0.2358308732509613, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.7539072, + "gpu_mem": 4.456476672, + "loss": 1.3859, + "grad_norm": 0.41554537415504456, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.754300416, + "gpu_mem": 4.456452096, + "loss": 1.3858, + "grad_norm": 0.24141836166381836, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.754693632, + "gpu_mem": 4.45648128, + "loss": 1.3916, + "grad_norm": 0.3961893618106842, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.755086848, + "gpu_mem": 4.45645824, + "loss": 1.361, + "grad_norm": 0.36020639538764954, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.755480064, + "gpu_mem": 4.456475136, + "loss": 1.4046, + "grad_norm": 0.37374791502952576, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.75587328, + "gpu_mem": 4.45644288, + "loss": 1.3952, + "grad_norm": 0.4205634295940399, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.756069888, + "gpu_mem": 4.45645824, + "loss": 1.3806, + "grad_norm": 0.28966137766838074, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.756463104, + "gpu_mem": 4.456438272, + "loss": 1.3974, + "grad_norm": 0.40935030579566956, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.75685632, + "gpu_mem": 4.456479744, + "loss": 1.3846, + "grad_norm": 0.23420073091983795, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.757249536, + "gpu_mem": 4.456475136, + "loss": 1.3832, + "grad_norm": 0.21373963356018066, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.757446144, + "gpu_mem": 4.45648128, + "loss": 1.3963, + "grad_norm": 0.4896712899208069, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.75783936, + "gpu_mem": 4.456478208, + "loss": 1.3899, + "grad_norm": 0.34885406494140625, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.758035968, + "gpu_mem": 4.456479744, + "loss": 1.3765, + "grad_norm": 0.2738078236579895, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.758429184, + "gpu_mem": 4.456476672, + "loss": 1.3822, + "grad_norm": 0.3038106858730316, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.7588224, + "gpu_mem": 4.456456704, + "loss": 1.3965, + "grad_norm": 0.24287869036197662, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.759019008, + "gpu_mem": 4.456452096, + "loss": 1.3718, + "grad_norm": 0.42314180731773376, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.759412224, + "gpu_mem": 4.456470528, + "loss": 1.4067, + "grad_norm": 0.310683012008667, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.75980544, + "gpu_mem": 4.45648128, + "loss": 1.4057, + "grad_norm": 0.29812875390052795, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.760002048, + "gpu_mem": 4.456467456, + "loss": 1.3859, + "grad_norm": 0.26686087250709534, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.760395264, + "gpu_mem": 4.456482816, + "loss": 1.437, + "grad_norm": 0.6901261210441589, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.76078848, + "gpu_mem": 4.456464384, + "loss": 1.3514, + "grad_norm": 0.33021309971809387, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.761181696, + "gpu_mem": 4.456490496, + "loss": 1.424, + "grad_norm": 0.541824996471405, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.761378304, + "gpu_mem": 4.456449024, + "loss": 1.3802, + "grad_norm": 0.34321486949920654, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.76177152, + "gpu_mem": 4.45648128, + "loss": 1.3808, + "grad_norm": 0.31990453600883484, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.762164736, + "gpu_mem": 4.456475136, + "loss": 1.4014, + "grad_norm": 0.25558334589004517, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.762557952, + "gpu_mem": 4.456476672, + "loss": 1.3707, + "grad_norm": 0.2793528437614441, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.76275456, + "gpu_mem": 4.456452096, + "loss": 1.3757, + "grad_norm": 0.35792556405067444, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.762951168, + "gpu_mem": 4.456461312, + "loss": 1.3551, + "grad_norm": 0.34719130396842957, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.763344384, + "gpu_mem": 4.456447488, + "loss": 1.425, + "grad_norm": 0.49844658374786377, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.7637376, + "gpu_mem": 4.456484352, + "loss": 1.3685, + "grad_norm": 0.26245594024658203, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.764130816, + "gpu_mem": 4.45648128, + "loss": 1.3854, + "grad_norm": 0.2032405585050583, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.764524032, + "gpu_mem": 4.45648128, + "loss": 1.4155, + "grad_norm": 0.33882296085357666, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.76472064, + "gpu_mem": 4.456470528, + "loss": 1.4019, + "grad_norm": 0.33191707730293274, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.764917248, + "gpu_mem": 4.456470528, + "loss": 1.3744, + "grad_norm": 0.36734244227409363, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.765310464, + "gpu_mem": 4.456452096, + "loss": 1.3866, + "grad_norm": 0.2859019339084625, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.76570368, + "gpu_mem": 4.456462848, + "loss": 1.41, + "grad_norm": 0.3999142348766327, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.766096896, + "gpu_mem": 4.456472064, + "loss": 1.3705, + "grad_norm": 0.2730843722820282, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.766293504, + "gpu_mem": 4.456487424, + "loss": 1.4002, + "grad_norm": 0.44417497515678406, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.76668672, + "gpu_mem": 4.4564352, + "loss": 1.3861, + "grad_norm": 0.2605229914188385, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.766883328, + "gpu_mem": 4.456455168, + "loss": 1.3944, + "grad_norm": 0.2520316541194916, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.767079936, + "gpu_mem": 4.456436736, + "loss": 1.3865, + "grad_norm": 0.3563708961009979, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.767276544, + "gpu_mem": 4.456453632, + "loss": 1.3954, + "grad_norm": 0.39678505063056946, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.76766976, + "gpu_mem": 4.456459776, + "loss": 1.3926, + "grad_norm": 0.36244311928749084, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.767866368, + "gpu_mem": 4.456456704, + "loss": 1.3889, + "grad_norm": 0.248079314827919, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.768062976, + "gpu_mem": 4.456482816, + "loss": 1.3813, + "grad_norm": 0.18736645579338074, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.768456192, + "gpu_mem": 4.456456704, + "loss": 1.3882, + "grad_norm": 0.19351427257061005, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.768849408, + "gpu_mem": 4.45649664, + "loss": 1.382, + "grad_norm": 0.39422181248664856, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.769046016, + "gpu_mem": 4.456445952, + "loss": 1.384, + "grad_norm": 0.23472219705581665, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.769439232, + "gpu_mem": 4.456455168, + "loss": 1.4089, + "grad_norm": 0.41428226232528687, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.76963584, + "gpu_mem": 4.456475136, + "loss": 1.3718, + "grad_norm": 0.28242161870002747, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.769832448, + "gpu_mem": 4.45646592, + "loss": 1.3758, + "grad_norm": 0.2426951825618744, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.770029056, + "gpu_mem": 4.456478208, + "loss": 1.382, + "grad_norm": 0.31796255707740784, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.770422272, + "gpu_mem": 4.45644288, + "loss": 1.3934, + "grad_norm": 0.436970591545105, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.770815488, + "gpu_mem": 4.4564736, + "loss": 1.3976, + "grad_norm": 0.47649306058883667, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.771012096, + "gpu_mem": 4.456468992, + "loss": 1.379, + "grad_norm": 0.3658604919910431, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.771208704, + "gpu_mem": 4.456493568, + "loss": 1.381, + "grad_norm": 0.2675972580909729, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.771405312, + "gpu_mem": 4.456430592, + "loss": 1.394, + "grad_norm": 0.29080531001091003, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.771798528, + "gpu_mem": 4.456484352, + "loss": 1.3685, + "grad_norm": 0.23970751464366913, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.771995136, + "gpu_mem": 4.456476672, + "loss": 1.3877, + "grad_norm": 0.25444090366363525, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.772388352, + "gpu_mem": 4.456492032, + "loss": 1.3996, + "grad_norm": 0.29837363958358765, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.77258496, + "gpu_mem": 4.456493568, + "loss": 1.3902, + "grad_norm": 0.2787809371948242, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.772781568, + "gpu_mem": 4.456461312, + "loss": 1.4474, + "grad_norm": 0.595015823841095, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.772978176, + "gpu_mem": 4.456453632, + "loss": 1.4045, + "grad_norm": 0.4013582468032837, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.773371392, + "gpu_mem": 4.45648896, + "loss": 1.4035, + "grad_norm": 0.30483126640319824, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.773568, + "gpu_mem": 4.456436736, + "loss": 1.3991, + "grad_norm": 0.42774951457977295, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.773764608, + "gpu_mem": 4.456472064, + "loss": 1.3935, + "grad_norm": 0.4030027389526367, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.773961216, + "gpu_mem": 4.456461312, + "loss": 1.3785, + "grad_norm": 0.184085875749588, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.774157824, + "gpu_mem": 4.456493568, + "loss": 1.3889, + "grad_norm": 0.2611943781375885, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.774354432, + "gpu_mem": 4.456487424, + "loss": 1.3825, + "grad_norm": 0.4100770354270935, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.774747648, + "gpu_mem": 4.45646592, + "loss": 1.3756, + "grad_norm": 0.14855626225471497, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.775140864, + "gpu_mem": 4.456462848, + "loss": 1.3953, + "grad_norm": 0.3923768699169159, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.775337472, + "gpu_mem": 4.456499712, + "loss": 1.392, + "grad_norm": 0.23006635904312134, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.77553408, + "gpu_mem": 4.4564736, + "loss": 1.3776, + "grad_norm": 0.34958183765411377, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.775730688, + "gpu_mem": 4.45645056, + "loss": 1.4005, + "grad_norm": 0.3414144515991211, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.775927296, + "gpu_mem": 4.456476672, + "loss": 1.4111, + "grad_norm": 0.375083863735199, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.776123904, + "gpu_mem": 4.45648896, + "loss": 1.413, + "grad_norm": 0.39379361271858215, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.776320512, + "gpu_mem": 4.45645056, + "loss": 1.3979, + "grad_norm": 0.305146723985672, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.776713728, + "gpu_mem": 4.456459776, + "loss": 1.4041, + "grad_norm": 0.23426823318004608, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.776910336, + "gpu_mem": 4.456441344, + "loss": 1.3866, + "grad_norm": 0.29743343591690063, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.777106944, + "gpu_mem": 4.456455168, + "loss": 1.4054, + "grad_norm": 0.4705488681793213, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.77750016, + "gpu_mem": 4.456462848, + "loss": 1.3951, + "grad_norm": 0.2622022330760956, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.777696768, + "gpu_mem": 4.456444416, + "loss": 1.4122, + "grad_norm": 0.4126568138599396, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.777893376, + "gpu_mem": 4.456475136, + "loss": 1.3862, + "grad_norm": 0.3025873005390167, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.777893376, + "gpu_mem": 4.456445952, + "loss": 1.3995, + "grad_norm": 0.40786364674568176, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.778089984, + "gpu_mem": 4.456470528, + "loss": 1.3828, + "grad_norm": 0.18969933688640594, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.7784832, + "gpu_mem": 4.45645056, + "loss": 1.3933, + "grad_norm": 0.3107098937034607, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.778679808, + "gpu_mem": 4.456482816, + "loss": 1.4008, + "grad_norm": 0.4428739845752716, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.778876416, + "gpu_mem": 4.4564736, + "loss": 1.4061, + "grad_norm": 0.3490951657295227, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.779073024, + "gpu_mem": 4.456468992, + "loss": 1.4176, + "grad_norm": 0.5071659088134766, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.779269632, + "gpu_mem": 4.456425984, + "loss": 1.3766, + "grad_norm": 0.18708591163158417, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.77946624, + "gpu_mem": 4.456505856, + "loss": 1.3831, + "grad_norm": 0.14888982474803925, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.779662848, + "gpu_mem": 4.456456704, + "loss": 1.372, + "grad_norm": 0.32653772830963135, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.779859456, + "gpu_mem": 4.456456704, + "loss": 1.3622, + "grad_norm": 0.23691974580287933, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.780252672, + "gpu_mem": 4.456422912, + "loss": 1.434, + "grad_norm": 0.5325156450271606, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.78044928, + "gpu_mem": 4.456462848, + "loss": 1.3731, + "grad_norm": 0.3505885899066925, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.780645888, + "gpu_mem": 4.45645824, + "loss": 1.361, + "grad_norm": 0.3582043945789337, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.780842496, + "gpu_mem": 4.456445952, + "loss": 1.3821, + "grad_norm": 0.20942674577236176, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.781235712, + "gpu_mem": 4.456470528, + "loss": 1.3953, + "grad_norm": 0.5377510190010071, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.78143232, + "gpu_mem": 4.456485888, + "loss": 1.3955, + "grad_norm": 0.490332692861557, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.781628928, + "gpu_mem": 4.45645824, + "loss": 1.3656, + "grad_norm": 0.22302719950675964, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.781628928, + "gpu_mem": 4.456455168, + "loss": 1.4009, + "grad_norm": 0.3683852553367615, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.781825536, + "gpu_mem": 4.456470528, + "loss": 1.4281, + "grad_norm": 0.3934862017631531, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.782022144, + "gpu_mem": 4.456447488, + "loss": 1.4059, + "grad_norm": 0.24954020977020264, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.782218752, + "gpu_mem": 4.456449024, + "loss": 1.3961, + "grad_norm": 0.25417691469192505, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.782218752, + "gpu_mem": 4.456490496, + "loss": 1.4023, + "grad_norm": 0.401838481426239, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.782611968, + "gpu_mem": 4.456461312, + "loss": 1.3929, + "grad_norm": 0.22035275399684906, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.783005184, + "gpu_mem": 4.456461312, + "loss": 1.3791, + "grad_norm": 0.26599034667015076, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.783201792, + "gpu_mem": 4.45645824, + "loss": 1.3982, + "grad_norm": 0.42136284708976746, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.7833984, + "gpu_mem": 4.45645824, + "loss": 1.3907, + "grad_norm": 0.2529582381248474, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.783595008, + "gpu_mem": 4.456449024, + "loss": 1.3792, + "grad_norm": 0.18555006384849548, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.783595008, + "gpu_mem": 4.456484352, + "loss": 1.3834, + "grad_norm": 0.34524399042129517, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.783791616, + "gpu_mem": 4.456441344, + "loss": 1.3778, + "grad_norm": 0.2249782532453537, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.783988224, + "gpu_mem": 4.456468992, + "loss": 1.3925, + "grad_norm": 0.2943700850009918, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.784184832, + "gpu_mem": 4.456478208, + "loss": 1.4141, + "grad_norm": 0.6228026151657104, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.78438144, + "gpu_mem": 4.45645056, + "loss": 1.3996, + "grad_norm": 0.5142458081245422, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.784578048, + "gpu_mem": 4.456459776, + "loss": 1.3793, + "grad_norm": 0.29969626665115356, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.784774656, + "gpu_mem": 4.456461312, + "loss": 1.3849, + "grad_norm": 0.4016188383102417, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.784971264, + "gpu_mem": 4.456461312, + "loss": 1.3943, + "grad_norm": 0.3565366268157959, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.785167872, + "gpu_mem": 4.456445952, + "loss": 1.3886, + "grad_norm": 0.25925496220588684, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.785561088, + "gpu_mem": 4.456467456, + "loss": 1.3831, + "grad_norm": 0.2719407379627228, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.785561088, + "gpu_mem": 4.456501248, + "loss": 1.3888, + "grad_norm": 0.3413953185081482, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.785757696, + "gpu_mem": 4.456455168, + "loss": 1.392, + "grad_norm": 0.3300778865814209, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.785954304, + "gpu_mem": 4.456461312, + "loss": 1.3917, + "grad_norm": 0.3114909529685974, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.786150912, + "gpu_mem": 4.456476672, + "loss": 1.3923, + "grad_norm": 0.29760125279426575, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.78634752, + "gpu_mem": 4.456495104, + "loss": 1.3912, + "grad_norm": 0.28222474455833435, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.786544128, + "gpu_mem": 4.456464384, + "loss": 1.3836, + "grad_norm": 0.15416952967643738, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.786740736, + "gpu_mem": 4.45645056, + "loss": 1.3715, + "grad_norm": 0.19557882845401764, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.786937344, + "gpu_mem": 4.45644288, + "loss": 1.3809, + "grad_norm": 0.15452682971954346, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.787133952, + "gpu_mem": 4.456507392, + "loss": 1.3783, + "grad_norm": 0.25241532921791077, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.78733056, + "gpu_mem": 4.456445952, + "loss": 1.3831, + "grad_norm": 0.4406883418560028, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.787527168, + "gpu_mem": 4.456498176, + "loss": 1.3797, + "grad_norm": 0.16332398355007172, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.787723776, + "gpu_mem": 4.456479744, + "loss": 1.3759, + "grad_norm": 0.21676817536354065, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.787920384, + "gpu_mem": 4.456478208, + "loss": 1.3967, + "grad_norm": 0.3869215250015259, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.788116992, + "gpu_mem": 4.456482816, + "loss": 1.3777, + "grad_norm": 0.22962695360183716, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.7883136, + "gpu_mem": 4.45645824, + "loss": 1.3962, + "grad_norm": 0.6808836460113525, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.788510208, + "gpu_mem": 4.456487424, + "loss": 1.3936, + "grad_norm": 0.34562379121780396, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.788706816, + "gpu_mem": 4.456464384, + "loss": 1.3868, + "grad_norm": 0.4363693296909332, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.788706816, + "gpu_mem": 4.456525824, + "loss": 1.4053, + "grad_norm": 0.5767877101898193, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.788903424, + "gpu_mem": 4.45645056, + "loss": 1.4162, + "grad_norm": 0.5625803470611572, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.789100032, + "gpu_mem": 4.456461312, + "loss": 1.3953, + "grad_norm": 0.41059908270835876, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.78929664, + "gpu_mem": 4.456459776, + "loss": 1.3994, + "grad_norm": 0.26568037271499634, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.789493248, + "gpu_mem": 4.456456704, + "loss": 1.3718, + "grad_norm": 0.2576943039894104, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.789689856, + "gpu_mem": 4.456487424, + "loss": 1.3823, + "grad_norm": 0.3465059697628021, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.789689856, + "gpu_mem": 4.45646592, + "loss": 1.3718, + "grad_norm": 0.33907851576805115, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.789886464, + "gpu_mem": 4.456461312, + "loss": 1.3717, + "grad_norm": 0.32871443033218384, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.790083072, + "gpu_mem": 4.456472064, + "loss": 1.3877, + "grad_norm": 0.5632917284965515, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.79027968, + "gpu_mem": 4.456476672, + "loss": 1.3905, + "grad_norm": 0.2127143293619156, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.790476288, + "gpu_mem": 4.456438272, + "loss": 1.393, + "grad_norm": 0.24060311913490295, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.790672896, + "gpu_mem": 4.456505856, + "loss": 1.3766, + "grad_norm": 0.23704484105110168, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.790869504, + "gpu_mem": 4.456468992, + "loss": 1.3581, + "grad_norm": 0.3303680419921875, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.791066112, + "gpu_mem": 4.45645824, + "loss": 1.401, + "grad_norm": 0.3787165582180023, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.791066112, + "gpu_mem": 4.456475136, + "loss": 1.3853, + "grad_norm": 0.38144877552986145, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.791066112, + "gpu_mem": 4.456449024, + "loss": 1.4041, + "grad_norm": 0.5314891338348389, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.79126272, + "gpu_mem": 4.45649664, + "loss": 1.3923, + "grad_norm": 0.2883230149745941, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.791459328, + "gpu_mem": 4.456464384, + "loss": 1.3856, + "grad_norm": 0.30883508920669556, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.791655936, + "gpu_mem": 4.456453632, + "loss": 1.3595, + "grad_norm": 0.23993192613124847, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.791852544, + "gpu_mem": 4.456468992, + "loss": 1.3827, + "grad_norm": 0.20591992139816284, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.792049152, + "gpu_mem": 4.45646592, + "loss": 1.3671, + "grad_norm": 0.18118496239185333, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.79224576, + "gpu_mem": 4.45646592, + "loss": 1.4232, + "grad_norm": 0.5539151430130005, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.79224576, + "gpu_mem": 4.456453632, + "loss": 1.3832, + "grad_norm": 0.31881117820739746, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.792442368, + "gpu_mem": 4.456436736, + "loss": 1.4006, + "grad_norm": 0.4240643382072449, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.792638976, + "gpu_mem": 4.456499712, + "loss": 1.3992, + "grad_norm": 0.39303088188171387, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.792835584, + "gpu_mem": 4.456453632, + "loss": 1.396, + "grad_norm": 0.355730265378952, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.793032192, + "gpu_mem": 4.456462848, + "loss": 1.3827, + "grad_norm": 0.4353964030742645, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.793032192, + "gpu_mem": 4.456498176, + "loss": 1.3797, + "grad_norm": 0.24804233014583588, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.793032192, + "gpu_mem": 4.456462848, + "loss": 1.3677, + "grad_norm": 0.2087867707014084, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.7932288, + "gpu_mem": 4.456467456, + "loss": 1.3996, + "grad_norm": 0.40083566308021545, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.793425408, + "gpu_mem": 4.456515072, + "loss": 1.3699, + "grad_norm": 0.19072279334068298, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.793622016, + "gpu_mem": 4.456524288, + "loss": 1.4068, + "grad_norm": 0.37349769473075867, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.793818624, + "gpu_mem": 4.456478208, + "loss": 1.3839, + "grad_norm": 0.3260330855846405, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.794015232, + "gpu_mem": 4.456472064, + "loss": 1.4108, + "grad_norm": 0.6276695728302002, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.794015232, + "gpu_mem": 4.456533504, + "loss": 1.3823, + "grad_norm": 0.4282324016094208, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.794015232, + "gpu_mem": 4.456459776, + "loss": 1.385, + "grad_norm": 0.32279354333877563, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.79421184, + "gpu_mem": 4.45645824, + "loss": 1.382, + "grad_norm": 0.2606470286846161, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.794408448, + "gpu_mem": 4.456461312, + "loss": 1.3819, + "grad_norm": 0.1615799516439438, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.794605056, + "gpu_mem": 4.456447488, + "loss": 1.3843, + "grad_norm": 0.3554295599460602, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.794801664, + "gpu_mem": 4.456462848, + "loss": 1.3829, + "grad_norm": 0.23938071727752686, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.794801664, + "gpu_mem": 4.456501248, + "loss": 1.3941, + "grad_norm": 0.2591249942779541, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.794998272, + "gpu_mem": 4.45648128, + "loss": 1.3769, + "grad_norm": 0.26971572637557983, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.79519488, + "gpu_mem": 4.456507392, + "loss": 1.3762, + "grad_norm": 0.25108039379119873, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.795391488, + "gpu_mem": 4.45645824, + "loss": 1.3662, + "grad_norm": 0.21153707802295685, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.795391488, + "gpu_mem": 4.456452096, + "loss": 1.3821, + "grad_norm": 0.2835651636123657, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.795391488, + "gpu_mem": 4.456475136, + "loss": 1.3598, + "grad_norm": 0.24776691198349, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.795588096, + "gpu_mem": 4.456453632, + "loss": 1.3758, + "grad_norm": 0.31109628081321716, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.795784704, + "gpu_mem": 4.456467456, + "loss": 1.3949, + "grad_norm": 0.2916359007358551, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.795981312, + "gpu_mem": 4.456472064, + "loss": 1.3705, + "grad_norm": 0.35861441493034363, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.795981312, + "gpu_mem": 4.456490496, + "loss": 1.3954, + "grad_norm": 0.3590727746486664, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.795981312, + "gpu_mem": 4.456461312, + "loss": 1.3562, + "grad_norm": 0.2638302147388458, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.79617792, + "gpu_mem": 4.45648896, + "loss": 1.3989, + "grad_norm": 0.22712145745754242, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.796374528, + "gpu_mem": 4.456470528, + "loss": 1.3693, + "grad_norm": 0.30042219161987305, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.796571136, + "gpu_mem": 4.45645824, + "loss": 1.3965, + "grad_norm": 0.3346201777458191, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.796571136, + "gpu_mem": 4.456467456, + "loss": 1.3674, + "grad_norm": 0.2995418906211853, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.796571136, + "gpu_mem": 4.456464384, + "loss": 1.3844, + "grad_norm": 0.3030101954936981, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.796767744, + "gpu_mem": 4.456479744, + "loss": 1.3961, + "grad_norm": 0.33009180426597595, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.796964352, + "gpu_mem": 4.456487424, + "loss": 1.3774, + "grad_norm": 0.24260330200195312, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.79716096, + "gpu_mem": 4.456476672, + "loss": 1.3967, + "grad_norm": 0.3089144825935364, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.79716096, + "gpu_mem": 4.456461312, + "loss": 1.3832, + "grad_norm": 0.3414043188095093, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.79716096, + "gpu_mem": 4.456464384, + "loss": 1.3718, + "grad_norm": 0.17955338954925537, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.797357568, + "gpu_mem": 4.45645824, + "loss": 1.3776, + "grad_norm": 0.24964337050914764, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.797554176, + "gpu_mem": 4.456453632, + "loss": 1.3896, + "grad_norm": 0.4620607793331146, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.797750784, + "gpu_mem": 4.456475136, + "loss": 1.3789, + "grad_norm": 0.21833990514278412, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.797750784, + "gpu_mem": 4.456467456, + "loss": 1.3669, + "grad_norm": 0.34281447529792786, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.797750784, + "gpu_mem": 4.456439808, + "loss": 1.3733, + "grad_norm": 0.19032926857471466, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.797947392, + "gpu_mem": 4.456438272, + "loss": 1.3765, + "grad_norm": 0.22631363570690155, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.798144, + "gpu_mem": 4.456464384, + "loss": 1.3764, + "grad_norm": 0.3414700925350189, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.798144, + "gpu_mem": 4.456447488, + "loss": 1.3776, + "grad_norm": 0.23705294728279114, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.798340608, + "gpu_mem": 4.456478208, + "loss": 1.3475, + "grad_norm": 0.6920710206031799, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.798340608, + "gpu_mem": 4.456461312, + "loss": 1.3921, + "grad_norm": 0.3216448724269867, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.798340608, + "gpu_mem": 4.456492032, + "loss": 1.381, + "grad_norm": 0.34208944439888, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.798537216, + "gpu_mem": 4.456459776, + "loss": 1.3841, + "grad_norm": 0.20339500904083252, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.798733824, + "gpu_mem": 4.456485888, + "loss": 1.3622, + "grad_norm": 0.24112558364868164, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.798733824, + "gpu_mem": 4.456461312, + "loss": 1.3777, + "grad_norm": 0.15670928359031677, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.798930432, + "gpu_mem": 4.456456704, + "loss": 1.3823, + "grad_norm": 0.26914849877357483, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.798930432, + "gpu_mem": 4.456459776, + "loss": 1.3686, + "grad_norm": 0.2761475443840027, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.798930432, + "gpu_mem": 4.456478208, + "loss": 1.3937, + "grad_norm": 0.321126788854599, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.79912704, + "gpu_mem": 4.45645824, + "loss": 1.3687, + "grad_norm": 0.33646339178085327, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.799323648, + "gpu_mem": 4.456462848, + "loss": 1.3871, + "grad_norm": 0.21041952073574066, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.799323648, + "gpu_mem": 4.45645824, + "loss": 1.4029, + "grad_norm": 0.3932421803474426, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.799520256, + "gpu_mem": 4.45646592, + "loss": 1.3779, + "grad_norm": 0.19864565134048462, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.799520256, + "gpu_mem": 4.456490496, + "loss": 1.3654, + "grad_norm": 0.24557720124721527, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.799716864, + "gpu_mem": 4.456482816, + "loss": 1.366, + "grad_norm": 0.3147515058517456, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.799716864, + "gpu_mem": 4.456484352, + "loss": 1.3669, + "grad_norm": 0.292223185300827, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.799716864, + "gpu_mem": 4.456459776, + "loss": 1.3789, + "grad_norm": 0.30180424451828003, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.799913472, + "gpu_mem": 4.456461312, + "loss": 1.393, + "grad_norm": 0.3244592249393463, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.799913472, + "gpu_mem": 4.45648128, + "loss": 1.3827, + "grad_norm": 0.3451114594936371, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.80011008, + "gpu_mem": 4.456453632, + "loss": 1.3908, + "grad_norm": 0.4603932201862335, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.80011008, + "gpu_mem": 4.45646592, + "loss": 1.3433, + "grad_norm": 0.3497075140476227, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.800306688, + "gpu_mem": 4.456475136, + "loss": 1.3907, + "grad_norm": 0.40782874822616577, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.800306688, + "gpu_mem": 4.456452096, + "loss": 1.3703, + "grad_norm": 0.5624381899833679, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.800503296, + "gpu_mem": 4.456476672, + "loss": 1.3768, + "grad_norm": 0.4368140697479248, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.800699904, + "gpu_mem": 4.456476672, + "loss": 1.3644, + "grad_norm": 0.30626195669174194, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.800699904, + "gpu_mem": 4.456459776, + "loss": 1.3787, + "grad_norm": 0.3445449471473694, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.800699904, + "gpu_mem": 4.456456704, + "loss": 1.3684, + "grad_norm": 0.7048637866973877, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.800699904, + "gpu_mem": 4.456449024, + "loss": 1.3682, + "grad_norm": 0.31783413887023926, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.800699904, + "gpu_mem": 4.456487424, + "loss": 1.418, + "grad_norm": 0.5915924906730652, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.800896512, + "gpu_mem": 4.456464384, + "loss": 1.3849, + "grad_norm": 0.36501532793045044, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.800896512, + "gpu_mem": 4.456462848, + "loss": 1.3799, + "grad_norm": 0.2751516103744507, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.80109312, + "gpu_mem": 4.456479744, + "loss": 1.3759, + "grad_norm": 0.33100783824920654, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.801289728, + "gpu_mem": 4.456464384, + "loss": 1.35, + "grad_norm": 0.30152633786201477, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.801289728, + "gpu_mem": 4.456476672, + "loss": 1.3915, + "grad_norm": 0.4885941445827484, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.801486336, + "gpu_mem": 4.45648896, + "loss": 1.3639, + "grad_norm": 0.29398658871650696, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.801486336, + "gpu_mem": 4.456464384, + "loss": 1.3669, + "grad_norm": 0.35672521591186523, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.801486336, + "gpu_mem": 4.456508928, + "loss": 1.3893, + "grad_norm": 0.5283788442611694, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.801486336, + "gpu_mem": 4.456482816, + "loss": 1.349, + "grad_norm": 0.27885013818740845, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.801486336, + "gpu_mem": 4.456479744, + "loss": 1.3589, + "grad_norm": 0.27331966161727905, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.801682944, + "gpu_mem": 4.456461312, + "loss": 1.3548, + "grad_norm": 0.34776753187179565, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.801879552, + "gpu_mem": 4.456467456, + "loss": 1.4242, + "grad_norm": 0.6444196701049805, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.801879552, + "gpu_mem": 4.456436736, + "loss": 1.3878, + "grad_norm": 0.4500907361507416, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.801879552, + "gpu_mem": 4.456501248, + "loss": 1.3759, + "grad_norm": 0.32508084177970886, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.80207616, + "gpu_mem": 4.456455168, + "loss": 1.3914, + "grad_norm": 0.4978691041469574, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.80207616, + "gpu_mem": 4.456449024, + "loss": 1.3704, + "grad_norm": 0.36849328875541687, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.802272768, + "gpu_mem": 4.45650432, + "loss": 1.351, + "grad_norm": 0.2934258282184601, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.802469376, + "gpu_mem": 4.456470528, + "loss": 1.3593, + "grad_norm": 0.3268783986568451, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.802469376, + "gpu_mem": 4.45645824, + "loss": 1.3465, + "grad_norm": 0.5246816277503967, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.802469376, + "gpu_mem": 4.456462848, + "loss": 1.3726, + "grad_norm": 0.41283857822418213, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.802469376, + "gpu_mem": 4.45644288, + "loss": 1.3604, + "grad_norm": 0.3387734293937683, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.802469376, + "gpu_mem": 4.456467456, + "loss": 1.3573, + "grad_norm": 0.4390133023262024, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.802665984, + "gpu_mem": 4.456445952, + "loss": 1.3571, + "grad_norm": 0.3529515266418457, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.802665984, + "gpu_mem": 4.456462848, + "loss": 1.3619, + "grad_norm": 0.4463919699192047, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.802862592, + "gpu_mem": 4.45642752, + "loss": 1.3834, + "grad_norm": 0.32317018508911133, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.802862592, + "gpu_mem": 4.456459776, + "loss": 1.3354, + "grad_norm": 0.33981382846832275, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.8030592, + "gpu_mem": 4.456449024, + "loss": 1.3508, + "grad_norm": 0.3714148700237274, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.8030592, + "gpu_mem": 4.456485888, + "loss": 1.3519, + "grad_norm": 0.39999955892562866, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.8030592, + "gpu_mem": 4.456452096, + "loss": 1.3493, + "grad_norm": 0.43257421255111694, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.8030592, + "gpu_mem": 4.456475136, + "loss": 1.3924, + "grad_norm": 0.5234917402267456, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.803255808, + "gpu_mem": 4.456464384, + "loss": 1.3354, + "grad_norm": 0.3192165195941925, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.803255808, + "gpu_mem": 4.456470528, + "loss": 1.3536, + "grad_norm": 0.43248245120048523, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.803255808, + "gpu_mem": 4.456464384, + "loss": 1.3911, + "grad_norm": 0.5354653000831604, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.803452416, + "gpu_mem": 4.456482816, + "loss": 1.396, + "grad_norm": 0.5466229915618896, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.803452416, + "gpu_mem": 4.45644288, + "loss": 1.3684, + "grad_norm": 0.33453068137168884, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.803452416, + "gpu_mem": 4.456475136, + "loss": 1.3518, + "grad_norm": 0.3269110918045044, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.803452416, + "gpu_mem": 4.456495104, + "loss": 1.3651, + "grad_norm": 0.42408987879753113, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.803649024, + "gpu_mem": 4.45648896, + "loss": 1.3478, + "grad_norm": 0.47613364458084106, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.803649024, + "gpu_mem": 4.456452096, + "loss": 1.3931, + "grad_norm": 0.648717999458313, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.803649024, + "gpu_mem": 4.456468992, + "loss": 1.3307, + "grad_norm": 0.6515316367149353, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.803845632, + "gpu_mem": 4.456445952, + "loss": 1.3064, + "grad_norm": 0.5355702638626099, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.803845632, + "gpu_mem": 4.456478208, + "loss": 1.3613, + "grad_norm": 0.8768520951271057, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.803845632, + "gpu_mem": 4.4564736, + "loss": 1.3059, + "grad_norm": 0.4714309871196747, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.80404224, + "gpu_mem": 4.456482816, + "loss": 1.3596, + "grad_norm": 0.5421189665794373, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.80404224, + "gpu_mem": 4.456456704, + "loss": 1.3231, + "grad_norm": 0.49507468938827515, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.80404224, + "gpu_mem": 4.456476672, + "loss": 1.2945, + "grad_norm": 0.7088404893875122, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.80404224, + "gpu_mem": 4.45645056, + "loss": 1.3088, + "grad_norm": 0.5102182030677795, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.804238848, + "gpu_mem": 4.456475136, + "loss": 1.3414, + "grad_norm": 0.5478403568267822, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.804238848, + "gpu_mem": 4.456459776, + "loss": 1.3265, + "grad_norm": 0.9156337380409241, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.804238848, + "gpu_mem": 4.456493568, + "loss": 1.2757, + "grad_norm": 0.7079420685768127, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.804238848, + "gpu_mem": 4.4564736, + "loss": 1.2784, + "grad_norm": 0.5032815933227539, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.804435456, + "gpu_mem": 4.45645824, + "loss": 1.3143, + "grad_norm": 0.9801205396652222, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.804435456, + "gpu_mem": 4.456493568, + "loss": 1.2938, + "grad_norm": 0.8485478758811951, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.804632064, + "gpu_mem": 4.456499712, + "loss": 1.3729, + "grad_norm": 1.1745039224624634, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.804632064, + "gpu_mem": 4.456462848, + "loss": 1.3991, + "grad_norm": 1.1911351680755615, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.804632064, + "gpu_mem": 4.456441344, + "loss": 1.2805, + "grad_norm": 0.7701823711395264, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.804632064, + "gpu_mem": 4.456493568, + "loss": 1.2413, + "grad_norm": 0.8868763446807861, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.804632064, + "gpu_mem": 4.456479744, + "loss": 1.2715, + "grad_norm": 1.0701212882995605, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.804632064, + "gpu_mem": 4.4564736, + "loss": 1.3047, + "grad_norm": 1.0676594972610474, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.804632064, + "gpu_mem": 4.456479744, + "loss": 1.2949, + "grad_norm": 0.8111123442649841, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.804828672, + "gpu_mem": 4.456456704, + "loss": 1.4342, + "grad_norm": 1.6300300359725952, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.80502528, + "gpu_mem": 4.456470528, + "loss": 1.3436, + "grad_norm": 1.7721853256225586, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.80502528, + "gpu_mem": 4.456470528, + "loss": 1.3235, + "grad_norm": 0.8750191926956177, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.80502528, + "gpu_mem": 4.456439808, + "loss": 1.2896, + "grad_norm": 1.3600075244903564, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.80502528, + "gpu_mem": 4.4564736, + "loss": 1.2929, + "grad_norm": 0.9232955574989319, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.80502528, + "gpu_mem": 4.456452096, + "loss": 1.2459, + "grad_norm": 0.9077758193016052, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.80502528, + "gpu_mem": 4.456459776, + "loss": 1.2284, + "grad_norm": 0.7218019962310791, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.805221888, + "gpu_mem": 4.456478208, + "loss": 1.1987, + "grad_norm": 0.6583279371261597, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.805418496, + "gpu_mem": 4.456445952, + "loss": 1.2622, + "grad_norm": 0.916189968585968, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.805418496, + "gpu_mem": 4.45645056, + "loss": 1.2725, + "grad_norm": 0.8789129257202148, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.805418496, + "gpu_mem": 4.456445952, + "loss": 1.208, + "grad_norm": 0.87615966796875, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.805418496, + "gpu_mem": 4.456490496, + "loss": 1.2833, + "grad_norm": 0.9081527590751648, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.805418496, + "gpu_mem": 4.4564736, + "loss": 1.1945, + "grad_norm": 1.1708338260650635, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.805615104, + "gpu_mem": 4.456462848, + "loss": 1.2217, + "grad_norm": 0.9913130402565002, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.805615104, + "gpu_mem": 4.456484352, + "loss": 1.1817, + "grad_norm": 1.180288553237915, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.805615104, + "gpu_mem": 4.45645056, + "loss": 1.2412, + "grad_norm": 1.0372031927108765, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.805615104, + "gpu_mem": 4.45646592, + "loss": 1.2245, + "grad_norm": 1.8488191366195679, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.805615104, + "gpu_mem": 4.45646592, + "loss": 1.2568, + "grad_norm": 1.4988534450531006, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.805615104, + "gpu_mem": 4.456456704, + "loss": 1.1459, + "grad_norm": 1.4519954919815063, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.805811712, + "gpu_mem": 4.456467456, + "loss": 1.3321, + "grad_norm": 2.378812551498413, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.805811712, + "gpu_mem": 4.456492032, + "loss": 1.1438, + "grad_norm": 0.979723334312439, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.805811712, + "gpu_mem": 4.456444416, + "loss": 1.3267, + "grad_norm": 2.0685203075408936, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.805811712, + "gpu_mem": 4.456479744, + "loss": 1.2382, + "grad_norm": 2.344755172729492, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.80600832, + "gpu_mem": 4.456441344, + "loss": 1.2237, + "grad_norm": 2.9527461528778076, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.80600832, + "gpu_mem": 4.456459776, + "loss": 1.4177, + "grad_norm": 2.0799171924591064, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.80600832, + "gpu_mem": 4.456452096, + "loss": 1.1696, + "grad_norm": 1.4602617025375366, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.80600832, + "gpu_mem": 4.45648896, + "loss": 1.203, + "grad_norm": 1.2469581365585327, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.806204928, + "gpu_mem": 4.456449024, + "loss": 1.2297, + "grad_norm": 2.0651254653930664, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.806204928, + "gpu_mem": 4.456462848, + "loss": 1.2969, + "grad_norm": 3.1099298000335693, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.806204928, + "gpu_mem": 4.456467456, + "loss": 1.1467, + "grad_norm": 1.5080673694610596, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.806401536, + "gpu_mem": 4.456429056, + "loss": 1.1209, + "grad_norm": 1.1852554082870483, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.806401536, + "gpu_mem": 4.456452096, + "loss": 1.289, + "grad_norm": 1.2570873498916626, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.806401536, + "gpu_mem": 4.45645056, + "loss": 1.2203, + "grad_norm": 1.1804797649383545, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.806401536, + "gpu_mem": 4.456468992, + "loss": 1.1833, + "grad_norm": 1.8091716766357422, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.806401536, + "gpu_mem": 4.45646592, + "loss": 1.3007, + "grad_norm": 1.0194973945617676, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.806401536, + "gpu_mem": 4.456464384, + "loss": 1.2434, + "grad_norm": 0.9688871502876282, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.806401536, + "gpu_mem": 4.456482816, + "loss": 1.0482, + "grad_norm": 1.453354001045227, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.806401536, + "gpu_mem": 4.456444416, + "loss": 1.1765, + "grad_norm": 0.993910551071167, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.806598144, + "gpu_mem": 4.45648896, + "loss": 1.1318, + "grad_norm": 1.1091222763061523, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.806598144, + "gpu_mem": 4.456453632, + "loss": 1.0359, + "grad_norm": 1.2227985858917236, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.806598144, + "gpu_mem": 4.45648128, + "loss": 1.108, + "grad_norm": 1.0463464260101318, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.806598144, + "gpu_mem": 4.456461312, + "loss": 1.1093, + "grad_norm": 1.7224947214126587, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.806598144, + "gpu_mem": 4.456507392, + "loss": 1.1154, + "grad_norm": 1.6234240531921387, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.806794752, + "gpu_mem": 4.456472064, + "loss": 1.1427, + "grad_norm": 1.7011550664901733, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.80699136, + "gpu_mem": 4.456462848, + "loss": 1.1562, + "grad_norm": 2.318251848220825, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.80699136, + "gpu_mem": 4.456456704, + "loss": 1.1697, + "grad_norm": 2.272237777709961, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.80699136, + "gpu_mem": 4.456441344, + "loss": 1.0975, + "grad_norm": 1.4558817148208618, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.80699136, + "gpu_mem": 4.456459776, + "loss": 1.2297, + "grad_norm": 3.49137544631958, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.80699136, + "gpu_mem": 4.456461312, + "loss": 0.9448, + "grad_norm": 1.5383589267730713, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.80699136, + "gpu_mem": 4.45646592, + "loss": 1.0278, + "grad_norm": 2.177945852279663, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.80699136, + "gpu_mem": 4.456468992, + "loss": 1.0927, + "grad_norm": 1.9978047609329224, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.80699136, + "gpu_mem": 4.456462848, + "loss": 1.2583, + "grad_norm": 2.4970059394836426, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.80699136, + "gpu_mem": 4.45648896, + "loss": 0.9416, + "grad_norm": 1.6889077425003052, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456456704, + "loss": 1.0056, + "grad_norm": 1.6961884498596191, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456484352, + "loss": 0.9916, + "grad_norm": 2.023078680038452, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456492032, + "loss": 1.1074, + "grad_norm": 2.981701135635376, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.807187968, + "gpu_mem": 4.4564736, + "loss": 1.1673, + "grad_norm": 2.578402280807495, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456459776, + "loss": 1.0737, + "grad_norm": 1.4027197360992432, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456470528, + "loss": 1.0552, + "grad_norm": 1.856907606124878, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456462848, + "loss": 1.1002, + "grad_norm": 1.976373553276062, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456479744, + "loss": 0.9377, + "grad_norm": 1.5682294368743896, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456452096, + "loss": 0.9555, + "grad_norm": 1.4511650800704956, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.807187968, + "gpu_mem": 4.456482816, + "loss": 1.0687, + "grad_norm": 2.9210867881774902, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.807384576, + "gpu_mem": 4.456464384, + "loss": 1.1522, + "grad_norm": 2.092254638671875, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.807384576, + "gpu_mem": 4.456452096, + "loss": 1.0511, + "grad_norm": 2.0090768337249756, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.807581184, + "gpu_mem": 4.456464384, + "loss": 0.9941, + "grad_norm": 2.6564903259277344, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.807581184, + "gpu_mem": 4.456470528, + "loss": 0.9844, + "grad_norm": 1.7499603033065796, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.807581184, + "gpu_mem": 4.45645824, + "loss": 0.9933, + "grad_norm": 2.0038347244262695, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.807581184, + "gpu_mem": 4.456447488, + "loss": 0.874, + "grad_norm": 2.2416679859161377, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.807581184, + "gpu_mem": 4.456449024, + "loss": 0.84, + "grad_norm": 1.5036391019821167, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.807581184, + "gpu_mem": 4.456462848, + "loss": 1.1587, + "grad_norm": 3.653614044189453, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.807777792, + "gpu_mem": 4.45646592, + "loss": 0.9171, + "grad_norm": 2.0239312648773193, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456476672, + "loss": 1.072, + "grad_norm": 2.9899234771728516, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.807777792, + "gpu_mem": 4.45645056, + "loss": 0.9946, + "grad_norm": 1.9496617317199707, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.807777792, + "gpu_mem": 4.45646592, + "loss": 1.0447, + "grad_norm": 2.0339791774749756, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456475136, + "loss": 0.9173, + "grad_norm": 2.3444836139678955, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456449024, + "loss": 1.0369, + "grad_norm": 2.918546438217163, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456455168, + "loss": 1.0722, + "grad_norm": 2.4702212810516357, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456444416, + "loss": 1.0563, + "grad_norm": 3.706148624420166, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.807777792, + "gpu_mem": 4.45645056, + "loss": 1.0198, + "grad_norm": 2.736229181289673, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456487424, + "loss": 0.9038, + "grad_norm": 1.8504544496536255, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.807777792, + "gpu_mem": 4.4564352, + "loss": 1.0437, + "grad_norm": 2.2408945560455322, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456455168, + "loss": 1.0071, + "grad_norm": 3.839552402496338, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456455168, + "loss": 0.9962, + "grad_norm": 3.4610788822174072, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.807777792, + "gpu_mem": 4.456453632, + "loss": 1.0891, + "grad_norm": 3.905059337615967, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456452096, + "loss": 0.8661, + "grad_norm": 2.5299017429351807, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456444416, + "loss": 0.9952, + "grad_norm": 2.083989381790161, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.8079744, + "gpu_mem": 4.45650432, + "loss": 0.9843, + "grad_norm": 2.518477439880371, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456449024, + "loss": 0.9833, + "grad_norm": 2.5281898975372314, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456432128, + "loss": 0.9136, + "grad_norm": 2.4145617485046387, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456462848, + "loss": 1.0319, + "grad_norm": 2.959143877029419, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456507392, + "loss": 0.9233, + "grad_norm": 2.5835678577423096, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456487424, + "loss": 0.9076, + "grad_norm": 2.5981979370117188, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456487424, + "loss": 0.9284, + "grad_norm": 3.7493090629577637, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456453632, + "loss": 0.9696, + "grad_norm": 2.9125239849090576, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.8079744, + "gpu_mem": 4.456478208, + "loss": 1.0269, + "grad_norm": 3.374387502670288, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.808171008, + "gpu_mem": 4.45648128, + "loss": 0.9441, + "grad_norm": 2.616443157196045, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.808171008, + "gpu_mem": 4.456459776, + "loss": 0.9166, + "grad_norm": 1.8926653861999512, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.808171008, + "gpu_mem": 4.456482816, + "loss": 0.8423, + "grad_norm": 2.8136515617370605, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.808171008, + "gpu_mem": 4.456462848, + "loss": 1.0199, + "grad_norm": 2.407231330871582, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.808171008, + "gpu_mem": 4.456487424, + "loss": 0.8281, + "grad_norm": 1.5886608362197876, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456470528, + "loss": 0.8961, + "grad_norm": 2.0994338989257812, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.808367616, + "gpu_mem": 4.45646592, + "loss": 0.8631, + "grad_norm": 1.5018783807754517, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456478208, + "loss": 1.0609, + "grad_norm": 1.968320608139038, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456449024, + "loss": 0.9181, + "grad_norm": 2.7423806190490723, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456462848, + "loss": 0.9654, + "grad_norm": 2.1253821849823, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456449024, + "loss": 0.8907, + "grad_norm": 3.0386617183685303, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.808367616, + "gpu_mem": 4.45644288, + "loss": 0.9218, + "grad_norm": 2.3291690349578857, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456449024, + "loss": 0.8181, + "grad_norm": 1.7332866191864014, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456462848, + "loss": 0.9421, + "grad_norm": 2.003187417984009, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456445952, + "loss": 0.9916, + "grad_norm": 2.8462653160095215, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.808367616, + "gpu_mem": 4.456499712, + "loss": 0.8033, + "grad_norm": 1.66242253780365, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.808564224, + "gpu_mem": 4.45644288, + "loss": 0.8631, + "grad_norm": 3.3293495178222656, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456521216, + "loss": 1.0182, + "grad_norm": 4.251587390899658, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456464384, + "loss": 0.8262, + "grad_norm": 3.472240447998047, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456482816, + "loss": 0.9516, + "grad_norm": 3.379431962966919, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.808564224, + "gpu_mem": 4.45645824, + "loss": 0.8469, + "grad_norm": 2.7214431762695312, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456490496, + "loss": 0.9145, + "grad_norm": 2.7970778942108154, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456510464, + "loss": 1.0685, + "grad_norm": 3.1822633743286133, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456439808, + "loss": 1.0837, + "grad_norm": 3.6782262325286865, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456453632, + "loss": 0.8502, + "grad_norm": 2.5391392707824707, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456438272, + "loss": 0.9348, + "grad_norm": 2.046281337738037, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456476672, + "loss": 0.8462, + "grad_norm": 2.882793664932251, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456476672, + "loss": 0.983, + "grad_norm": 2.7392687797546387, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456462848, + "loss": 0.9918, + "grad_norm": 2.434717893600464, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456453632, + "loss": 0.7791, + "grad_norm": 1.9082975387573242, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.808564224, + "gpu_mem": 4.45645824, + "loss": 0.9363, + "grad_norm": 2.2349612712860107, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456461312, + "loss": 0.9536, + "grad_norm": 2.894627809524536, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456467456, + "loss": 0.875, + "grad_norm": 1.7110627889633179, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.808564224, + "gpu_mem": 4.456485888, + "loss": 0.8728, + "grad_norm": 2.1646320819854736, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456479744, + "loss": 0.9226, + "grad_norm": 2.4913902282714844, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456456704, + "loss": 0.9259, + "grad_norm": 2.7608680725097656, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456444416, + "loss": 0.9845, + "grad_norm": 3.5295655727386475, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456410624, + "loss": 1.0515, + "grad_norm": 5.383983612060547, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.808760832, + "gpu_mem": 4.45645824, + "loss": 0.905, + "grad_norm": 2.4793217182159424, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456424448, + "loss": 0.9515, + "grad_norm": 1.763176441192627, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456472064, + "loss": 0.9184, + "grad_norm": 2.6453194618225098, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456470528, + "loss": 0.902, + "grad_norm": 2.20216965675354, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.808760832, + "gpu_mem": 4.456472064, + "loss": 0.7802, + "grad_norm": 2.7239909172058105, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.808760832, + "gpu_mem": 4.45648128, + "loss": 1.0194, + "grad_norm": 2.501967191696167, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456456704, + "loss": 0.8517, + "grad_norm": 2.181183099746704, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456441344, + "loss": 0.8453, + "grad_norm": 1.6856112480163574, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456470528, + "loss": 0.9293, + "grad_norm": 3.1360461711883545, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456484352, + "loss": 0.9301, + "grad_norm": 1.814341425895691, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456439808, + "loss": 1.0481, + "grad_norm": 4.089025497436523, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456445952, + "loss": 1.1787, + "grad_norm": 3.011101245880127, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456475136, + "loss": 1.0261, + "grad_norm": 2.265751361846924, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456470528, + "loss": 0.7825, + "grad_norm": 1.7601159811019897, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456456704, + "loss": 0.9745, + "grad_norm": 2.4858498573303223, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.80895744, + "gpu_mem": 4.456470528, + "loss": 0.8472, + "grad_norm": 3.279144287109375, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456459776, + "loss": 0.9227, + "grad_norm": 2.0984091758728027, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.809154048, + "gpu_mem": 4.45646592, + "loss": 0.8516, + "grad_norm": 2.1800177097320557, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456470528, + "loss": 0.9757, + "grad_norm": 2.5580296516418457, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.809154048, + "gpu_mem": 4.45646592, + "loss": 0.8616, + "grad_norm": 2.573448896408081, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456439808, + "loss": 0.8333, + "grad_norm": 2.3756070137023926, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456449024, + "loss": 0.9057, + "grad_norm": 2.7708489894866943, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456467456, + "loss": 1.0279, + "grad_norm": 2.7567365169525146, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456438272, + "loss": 0.9196, + "grad_norm": 2.6583333015441895, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456468992, + "loss": 0.8971, + "grad_norm": 2.0568013191223145, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456478208, + "loss": 0.9267, + "grad_norm": 2.130746603012085, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456439808, + "loss": 1.0663, + "grad_norm": 2.6815202236175537, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456444416, + "loss": 0.7626, + "grad_norm": 1.9965585470199585, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456468992, + "loss": 0.7735, + "grad_norm": 1.708986520767212, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456487424, + "loss": 0.9413, + "grad_norm": 1.8210986852645874, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456468992, + "loss": 1.0459, + "grad_norm": 3.2495720386505127, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.809154048, + "gpu_mem": 4.45651968, + "loss": 0.8365, + "grad_norm": 1.9135313034057617, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456452096, + "loss": 1.0789, + "grad_norm": 2.5548579692840576, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456453632, + "loss": 0.8677, + "grad_norm": 2.5515215396881104, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456453632, + "loss": 0.9902, + "grad_norm": 3.279202938079834, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456459776, + "loss": 0.8365, + "grad_norm": 1.9643250703811646, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.809154048, + "gpu_mem": 4.4564736, + "loss": 0.8288, + "grad_norm": 2.3503835201263428, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456478208, + "loss": 0.9847, + "grad_norm": 2.905104875564575, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456472064, + "loss": 0.8892, + "grad_norm": 2.907665729522705, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.809154048, + "gpu_mem": 4.45646592, + "loss": 0.8601, + "grad_norm": 2.1730449199676514, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456479744, + "loss": 0.8791, + "grad_norm": 3.2807812690734863, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456472064, + "loss": 1.0603, + "grad_norm": 2.285287857055664, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456456704, + "loss": 0.8973, + "grad_norm": 2.1504557132720947, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.809154048, + "gpu_mem": 4.45646592, + "loss": 0.9706, + "grad_norm": 2.0538368225097656, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456475136, + "loss": 0.8493, + "grad_norm": 2.082390308380127, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456478208, + "loss": 1.0256, + "grad_norm": 2.828127861022949, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456432128, + "loss": 0.8806, + "grad_norm": 2.6117889881134033, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456485888, + "loss": 0.8203, + "grad_norm": 2.674793243408203, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456487424, + "loss": 0.6633, + "grad_norm": 1.600711464881897, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.809154048, + "gpu_mem": 4.456432128, + "loss": 0.9213, + "grad_norm": 2.563718318939209, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.809154048, + "gpu_mem": 4.45646592, + "loss": 1.0261, + "grad_norm": 2.619046211242676, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456444416, + "loss": 0.9092, + "grad_norm": 1.8496012687683105, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456475136, + "loss": 0.8851, + "grad_norm": 2.0986618995666504, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.809350656, + "gpu_mem": 4.45645056, + "loss": 0.8494, + "grad_norm": 2.8303415775299072, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456484352, + "loss": 0.8422, + "grad_norm": 2.1133575439453125, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.809350656, + "gpu_mem": 4.45650432, + "loss": 1.0413, + "grad_norm": 3.0517375469207764, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456468992, + "loss": 0.8291, + "grad_norm": 2.797323703765869, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.809350656, + "gpu_mem": 4.45648896, + "loss": 0.9053, + "grad_norm": 2.0445802211761475, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456468992, + "loss": 0.7954, + "grad_norm": 2.0742852687835693, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456470528, + "loss": 0.8644, + "grad_norm": 3.4847335815429688, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456462848, + "loss": 0.851, + "grad_norm": 2.7452690601348877, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456461312, + "loss": 0.8892, + "grad_norm": 2.815835952758789, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456475136, + "loss": 1.0197, + "grad_norm": 2.3586599826812744, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456445952, + "loss": 0.8977, + "grad_norm": 2.4237234592437744, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.809350656, + "gpu_mem": 4.45649664, + "loss": 1.0575, + "grad_norm": 4.875778675079346, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456493568, + "loss": 0.9063, + "grad_norm": 2.788086414337158, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.809350656, + "gpu_mem": 4.4564736, + "loss": 0.9126, + "grad_norm": 2.4821043014526367, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456455168, + "loss": 0.8398, + "grad_norm": 1.955717921257019, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456464384, + "loss": 0.9081, + "grad_norm": 2.556626081466675, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456432128, + "loss": 0.9477, + "grad_norm": 4.19248628616333, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456493568, + "loss": 0.9258, + "grad_norm": 2.5303962230682373, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456492032, + "loss": 0.9099, + "grad_norm": 2.2715554237365723, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456447488, + "loss": 0.8814, + "grad_norm": 2.0561680793762207, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456479744, + "loss": 0.8255, + "grad_norm": 2.545565128326416, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.809350656, + "gpu_mem": 4.4564736, + "loss": 0.7438, + "grad_norm": 2.1717121601104736, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456459776, + "loss": 0.7719, + "grad_norm": 3.2455124855041504, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456459776, + "loss": 1.0074, + "grad_norm": 2.6642205715179443, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456485888, + "loss": 0.8265, + "grad_norm": 2.0840072631835938, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.809350656, + "gpu_mem": 4.4564736, + "loss": 0.7508, + "grad_norm": 2.643014669418335, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456464384, + "loss": 1.0115, + "grad_norm": 1.9522701501846313, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456456704, + "loss": 0.7706, + "grad_norm": 3.22189998626709, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456479744, + "loss": 0.7243, + "grad_norm": 2.309396982192993, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456468992, + "loss": 0.8263, + "grad_norm": 1.849007248878479, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456453632, + "loss": 0.7949, + "grad_norm": 2.6056885719299316, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.809350656, + "gpu_mem": 4.45651968, + "loss": 0.8039, + "grad_norm": 2.628697395324707, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456462848, + "loss": 1.0014, + "grad_norm": 2.5199790000915527, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456449024, + "loss": 0.9339, + "grad_norm": 2.171428680419922, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456513536, + "loss": 0.9545, + "grad_norm": 3.3220839500427246, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.809350656, + "gpu_mem": 4.45644288, + "loss": 0.8303, + "grad_norm": 2.676177978515625, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456470528, + "loss": 0.7201, + "grad_norm": 2.6833105087280273, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.809350656, + "gpu_mem": 4.4564736, + "loss": 0.7927, + "grad_norm": 1.8235676288604736, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456449024, + "loss": 0.9017, + "grad_norm": 2.877737045288086, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456476672, + "loss": 0.7923, + "grad_norm": 1.8737214803695679, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456485888, + "loss": 0.9792, + "grad_norm": 2.830772638320923, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456482816, + "loss": 0.9321, + "grad_norm": 2.5512380599975586, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456455168, + "loss": 1.0029, + "grad_norm": 2.564606189727783, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456201728, + "loss": 0.7466, + "grad_norm": 2.5306501388549805, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.809350656, + "gpu_mem": 4.456201728, + "train_runtime": 8039.6235, + "train_samples_per_second": 4.964, + "train_steps_per_second": 0.078, + "total_flos": 0.0, + "train_loss": 1.2762103585096507 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r2-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a6595677543f23b42f06770761e8d2aa18b1163d --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r2-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 4, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 2, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r2-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..952ea9f97581d46d7e2eeb7fd1f634da30823ac8 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 789096 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-winogrande-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r2-a2", + "seed": 42, + "timestamp": "2025-09-02T08:23:57.212084" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d69c6bc9ef572e681044e096143c4cad32a3229 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 32, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8f1b7cc12695f3796f1674ebf8085554f5d6baec --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5224940805051302 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0b835822341e20240e5c8019fabb28b167709b --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12773376 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-winogrande-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2", + "seed": 42, + "timestamp": "2025-09-02T22:14:13.485153" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..1be1f772e03df1a51858cfb7ab3d65a570a00735 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r32-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.712881664, + "gpu_mem": 4.469778944, + "loss": 3.3802, + "grad_norm": 32.1689338684082, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.713471488, + "gpu_mem": 4.571963904, + "loss": 3.3361, + "grad_norm": 31.53998565673828, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.713668096, + "gpu_mem": 4.571968512, + "loss": 3.0855, + "grad_norm": 29.794095993041992, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.714061312, + "gpu_mem": 4.571966976, + "loss": 2.764, + "grad_norm": 29.946321487426758, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.71425792, + "gpu_mem": 4.571966976, + "loss": 2.382, + "grad_norm": 27.307903289794922, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.714454528, + "gpu_mem": 4.57197312, + "loss": 1.9164, + "grad_norm": 22.70431137084961, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.714847744, + "gpu_mem": 4.571979264, + "loss": 1.4825, + "grad_norm": 15.041815757751465, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.715044352, + "gpu_mem": 4.571962368, + "loss": 1.2038, + "grad_norm": 10.135503768920898, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.715044352, + "gpu_mem": 4.571968512, + "loss": 1.1983, + "grad_norm": 10.96366024017334, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.71524096, + "gpu_mem": 4.571971584, + "loss": 0.9431, + "grad_norm": 6.626117706298828, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.715437568, + "gpu_mem": 4.571960832, + "loss": 0.7981, + "grad_norm": 4.113291263580322, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.715634176, + "gpu_mem": 4.57196544, + "loss": 0.7407, + "grad_norm": 3.954171657562256, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.715830784, + "gpu_mem": 4.57197312, + "loss": 0.8855, + "grad_norm": 11.876354217529297, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.716027392, + "gpu_mem": 4.571968512, + "loss": 0.8488, + "grad_norm": 12.792831420898438, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.716027392, + "gpu_mem": 4.571968512, + "loss": 0.878, + "grad_norm": 11.579307556152344, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.716027392, + "gpu_mem": 4.57196544, + "loss": 0.7041, + "grad_norm": 3.09954571723938, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.716224, + "gpu_mem": 4.57196544, + "loss": 0.6879, + "grad_norm": 1.0278576612472534, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.716224, + "gpu_mem": 4.571968512, + "loss": 0.76, + "grad_norm": 3.5721943378448486, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.716420608, + "gpu_mem": 4.57196544, + "loss": 0.7737, + "grad_norm": 3.449002981185913, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.716420608, + "gpu_mem": 4.57197312, + "loss": 0.7914, + "grad_norm": 3.4342544078826904, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.716420608, + "gpu_mem": 4.57196544, + "loss": 0.6948, + "grad_norm": 0.9149996042251587, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.716420608, + "gpu_mem": 4.57196544, + "loss": 0.7296, + "grad_norm": 2.0590622425079346, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.716617216, + "gpu_mem": 4.571960832, + "loss": 0.6745, + "grad_norm": 0.7544016242027283, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.716617216, + "gpu_mem": 4.571963904, + "loss": 0.7204, + "grad_norm": 1.3646470308303833, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.716617216, + "gpu_mem": 4.571966976, + "loss": 0.7236, + "grad_norm": 0.868041455745697, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.716617216, + "gpu_mem": 4.571962368, + "loss": 0.7047, + "grad_norm": 1.171709418296814, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.716617216, + "gpu_mem": 4.571960832, + "loss": 0.7338, + "grad_norm": 2.18450927734375, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.716617216, + "gpu_mem": 4.571966976, + "loss": 0.702, + "grad_norm": 0.5332483053207397, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.716617216, + "gpu_mem": 4.57196544, + "loss": 0.7234, + "grad_norm": 0.889076828956604, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.716813824, + "gpu_mem": 4.57196544, + "loss": 0.7256, + "grad_norm": 1.3256253004074097, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.716813824, + "gpu_mem": 4.57196544, + "loss": 0.6893, + "grad_norm": 0.3865755796432495, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6619, + "grad_norm": 0.6689721345901489, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7986, + "grad_norm": 3.04117751121521, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7228, + "grad_norm": 1.7011828422546387, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7496, + "grad_norm": 3.9139840602874756, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.8573, + "grad_norm": 5.903606414794922, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6923, + "grad_norm": 1.0770939588546753, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6935, + "grad_norm": 0.7174488306045532, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6768, + "grad_norm": 0.3294523358345032, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7557, + "grad_norm": 1.5978267192840576, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6985, + "grad_norm": 0.39825746417045593, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6962, + "grad_norm": 0.9870648384094238, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7196, + "grad_norm": 1.1788341999053955, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7045, + "grad_norm": 0.6213193535804749, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.699, + "grad_norm": 0.4660557806491852, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7154, + "grad_norm": 0.5174314379692078, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7168, + "grad_norm": 0.46802976727485657, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6882, + "grad_norm": 0.5682080984115601, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.7535, + "grad_norm": 1.8219002485275269, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7144, + "grad_norm": 0.7219007015228271, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7002, + "grad_norm": 1.0813559293746948, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7213, + "grad_norm": 1.2716503143310547, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7521, + "grad_norm": 2.05289888381958, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571956224, + "loss": 0.6897, + "grad_norm": 0.2148728370666504, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6962, + "grad_norm": 0.21705928444862366, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6935, + "grad_norm": 0.15373973548412323, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.7151, + "grad_norm": 0.7540801763534546, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6975, + "grad_norm": 0.27551501989364624, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6953, + "grad_norm": 0.2733488380908966, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6987, + "grad_norm": 0.21085555851459503, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6999, + "grad_norm": 0.3318427503108978, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7014, + "grad_norm": 0.47032707929611206, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7027, + "grad_norm": 0.7467537522315979, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.7115, + "grad_norm": 0.7446199655532837, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6853, + "grad_norm": 0.22937144339084625, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7305, + "grad_norm": 0.9117576479911804, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.683, + "grad_norm": 0.2598615288734436, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7049, + "grad_norm": 0.2957565188407898, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6893, + "grad_norm": 0.15510152280330658, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6983, + "grad_norm": 0.26608291268348694, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.7038, + "grad_norm": 0.33182021975517273, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6962, + "grad_norm": 0.2412266880273819, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.72, + "grad_norm": 0.9211668968200684, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6968, + "grad_norm": 0.15469565987586975, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6832, + "grad_norm": 0.455708771944046, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7669, + "grad_norm": 1.5266261100769043, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.7208, + "grad_norm": 0.8196693658828735, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7049, + "grad_norm": 0.6286107301712036, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6982, + "grad_norm": 0.21492691338062286, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7024, + "grad_norm": 0.28664469718933105, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.7046, + "grad_norm": 0.5319966673851013, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6963, + "grad_norm": 0.35548657178878784, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7149, + "grad_norm": 0.8282843828201294, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.692, + "grad_norm": 0.3110659122467041, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7097, + "grad_norm": 0.4593585431575775, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6955, + "grad_norm": 0.24550582468509674, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6879, + "grad_norm": 0.123134545981884, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.705, + "grad_norm": 0.47104912996292114, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6965, + "grad_norm": 0.2713971734046936, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6887, + "grad_norm": 0.2528550326824188, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6785, + "grad_norm": 0.509993314743042, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.742, + "grad_norm": 0.897745668888092, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7149, + "grad_norm": 0.5334535241127014, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6967, + "grad_norm": 0.20767106115818024, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6885, + "grad_norm": 0.2857329547405243, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6945, + "grad_norm": 0.4487205445766449, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.703, + "grad_norm": 0.3160085380077362, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7004, + "grad_norm": 0.30160436034202576, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.68, + "grad_norm": 0.4484723210334778, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7009, + "grad_norm": 0.36186373233795166, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7002, + "grad_norm": 0.21791011095046997, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.7116, + "grad_norm": 0.531248927116394, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6889, + "grad_norm": 0.19894546270370483, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6966, + "grad_norm": 0.23612190783023834, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6979, + "grad_norm": 0.20928075909614563, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7034, + "grad_norm": 0.4276311695575714, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6986, + "grad_norm": 0.2789122760295868, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6915, + "grad_norm": 0.10478498041629791, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7026, + "grad_norm": 0.17489172518253326, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6935, + "grad_norm": 0.39998719096183777, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7005, + "grad_norm": 0.246766597032547, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6991, + "grad_norm": 0.22258344292640686, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57195776, + "loss": 0.6795, + "grad_norm": 0.2079460769891739, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6924, + "grad_norm": 0.14717693626880646, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7033, + "grad_norm": 0.36552441120147705, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7393, + "grad_norm": 1.2145140171051025, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.693, + "grad_norm": 0.08560096472501755, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6941, + "grad_norm": 0.07255161553621292, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7277, + "grad_norm": 0.9523004293441772, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7007, + "grad_norm": 0.2972448468208313, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6949, + "grad_norm": 0.06620430946350098, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6876, + "grad_norm": 0.5297858119010925, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6949, + "grad_norm": 0.056497056037187576, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6994, + "grad_norm": 0.23698468506336212, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57195776, + "loss": 0.695, + "grad_norm": 0.11117374897003174, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6961, + "grad_norm": 0.24096669256687164, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6944, + "grad_norm": 0.07378742098808289, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7012, + "grad_norm": 0.4092985987663269, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6956, + "grad_norm": 0.09056918323040009, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6967, + "grad_norm": 0.14956776797771454, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6945, + "grad_norm": 0.19803333282470703, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6977, + "grad_norm": 0.48452550172805786, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6929, + "grad_norm": 0.8557135462760925, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6988, + "grad_norm": 0.3551441431045532, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6719, + "grad_norm": 0.14573824405670166, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6776, + "grad_norm": 0.22165468335151672, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7014, + "grad_norm": 0.5670106410980225, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7281, + "grad_norm": 0.8851659893989563, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7175, + "grad_norm": 0.70131915807724, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6883, + "grad_norm": 0.13828440010547638, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6928, + "grad_norm": 0.42647144198417664, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6957, + "grad_norm": 0.38046514987945557, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6949, + "grad_norm": 0.08273141086101532, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.693, + "grad_norm": 0.13509684801101685, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.687, + "grad_norm": 0.23182788491249084, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.683, + "grad_norm": 0.09309686720371246, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7409, + "grad_norm": 0.8272542953491211, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6953, + "grad_norm": 0.20700886845588684, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6901, + "grad_norm": 0.3300243318080902, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6994, + "grad_norm": 0.42934930324554443, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6961, + "grad_norm": 0.199885293841362, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.696, + "grad_norm": 0.09874448925256729, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6957, + "grad_norm": 0.2374245524406433, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.692, + "grad_norm": 0.15978726744651794, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7024, + "grad_norm": 0.483416348695755, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.697, + "grad_norm": 0.2337210327386856, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6738, + "grad_norm": 1.0724352598190308, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571976192, + "loss": 0.7084, + "grad_norm": 0.4240679144859314, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6932, + "grad_norm": 0.25563615560531616, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7251, + "grad_norm": 0.6342345476150513, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7242, + "grad_norm": 0.5745607018470764, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.714, + "grad_norm": 0.48474857211112976, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7036, + "grad_norm": 0.6228493452072144, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6816, + "grad_norm": 0.2759997844696045, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6797, + "grad_norm": 0.10200056433677673, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7459, + "grad_norm": 0.7267525792121887, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7605, + "grad_norm": 0.8250167965888977, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.7362, + "grad_norm": 0.6142211556434631, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7194, + "grad_norm": 0.5058600902557373, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.7166, + "grad_norm": 0.5433225631713867, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6973, + "grad_norm": 0.4207172691822052, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6952, + "grad_norm": 0.0757172480225563, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57195776, + "loss": 0.6969, + "grad_norm": 0.148137167096138, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7025, + "grad_norm": 0.3433571457862854, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7203, + "grad_norm": 0.592641294002533, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6931, + "grad_norm": 0.08390302956104279, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.703, + "grad_norm": 0.3761526346206665, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6978, + "grad_norm": 0.20027367770671844, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7014, + "grad_norm": 0.23353342711925507, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6949, + "grad_norm": 0.17496509850025177, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.5719808, + "loss": 0.6918, + "grad_norm": 0.5516202449798584, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7089, + "grad_norm": 0.7007341980934143, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6949, + "grad_norm": 0.42201465368270874, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6831, + "grad_norm": 0.20444747805595398, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6871, + "grad_norm": 0.14490249752998352, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.7226, + "grad_norm": 0.6153913736343384, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.676, + "grad_norm": 0.10400459915399551, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6934, + "grad_norm": 0.20555929839611053, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6863, + "grad_norm": 0.35662564635276794, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7032, + "grad_norm": 0.611815333366394, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7085, + "grad_norm": 0.5617057681083679, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7025, + "grad_norm": 0.6491231918334961, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6924, + "grad_norm": 0.21333308517932892, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6903, + "grad_norm": 0.1674475073814392, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7298, + "grad_norm": 0.9665305614471436, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6823, + "grad_norm": 0.19133318960666656, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6999, + "grad_norm": 0.1539922058582306, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7001, + "grad_norm": 0.21028481423854828, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6844, + "grad_norm": 0.08904171735048294, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7232, + "grad_norm": 0.6141135096549988, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.7058, + "grad_norm": 0.4128480553627014, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6863, + "grad_norm": 0.31763043999671936, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7018, + "grad_norm": 0.7675949931144714, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6938, + "grad_norm": 0.06750268489122391, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6907, + "grad_norm": 0.23038960993289948, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7137, + "grad_norm": 0.46466726064682007, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.702, + "grad_norm": 0.29555705189704895, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.681, + "grad_norm": 0.08953425288200378, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7269, + "grad_norm": 0.6399303078651428, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7085, + "grad_norm": 0.4344145357608795, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6967, + "grad_norm": 0.4960492253303528, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6964, + "grad_norm": 0.13717401027679443, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6927, + "grad_norm": 0.0784611776471138, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.7013, + "grad_norm": 0.3192507028579712, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6887, + "grad_norm": 0.15926799178123474, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6828, + "grad_norm": 0.2850461006164551, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7057, + "grad_norm": 0.39781439304351807, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7161, + "grad_norm": 0.6818528175354004, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6934, + "grad_norm": 0.04837153106927872, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6933, + "grad_norm": 0.06746860593557358, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6967, + "grad_norm": 0.5540512204170227, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6942, + "grad_norm": 0.06623371690511703, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6973, + "grad_norm": 0.34845584630966187, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6923, + "grad_norm": 0.08514504879713058, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6953, + "grad_norm": 0.2627629339694977, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6959, + "grad_norm": 0.19740509986877441, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.695, + "grad_norm": 0.05116485804319382, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.692, + "grad_norm": 0.08216831088066101, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.693, + "grad_norm": 0.09249627590179443, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6933, + "grad_norm": 0.1620689034461975, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6934, + "grad_norm": 0.3793896734714508, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6932, + "grad_norm": 0.1461774706840515, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6958, + "grad_norm": 0.07761687785387039, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571976192, + "loss": 0.6954, + "grad_norm": 0.09012317657470703, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6989, + "grad_norm": 0.3257127106189728, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6903, + "grad_norm": 0.41912370920181274, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.7074, + "grad_norm": 0.4793280363082886, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.7065, + "grad_norm": 0.8029257655143738, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6957, + "grad_norm": 0.25974783301353455, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6951, + "grad_norm": 0.1260383427143097, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6781, + "grad_norm": 0.3276844322681427, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6826, + "grad_norm": 0.09126769006252289, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.7442, + "grad_norm": 0.8104130625724792, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7213, + "grad_norm": 0.5069760680198669, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7108, + "grad_norm": 0.4709412753582001, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6939, + "grad_norm": 0.05098516121506691, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6927, + "grad_norm": 0.1472744196653366, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6961, + "grad_norm": 0.37528201937675476, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6884, + "grad_norm": 0.1264956146478653, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7154, + "grad_norm": 0.5412298440933228, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6901, + "grad_norm": 0.06004335358738899, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6856, + "grad_norm": 0.1410798579454422, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6978, + "grad_norm": 0.14781467616558075, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.701, + "grad_norm": 0.1900554746389389, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6886, + "grad_norm": 0.06143057718873024, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6994, + "grad_norm": 0.2810983955860138, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6945, + "grad_norm": 0.04753813147544861, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6955, + "grad_norm": 0.31154751777648926, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6993, + "grad_norm": 0.47017306089401245, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.697, + "grad_norm": 0.2748226821422577, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.689, + "grad_norm": 0.5120517611503601, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6939, + "grad_norm": 0.1018412709236145, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571976192, + "loss": 0.694, + "grad_norm": 0.13063260912895203, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.694, + "grad_norm": 0.08045435696840286, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7007, + "grad_norm": 0.29500266909599304, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6901, + "grad_norm": 0.12323495745658875, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6902, + "grad_norm": 0.1330847293138504, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7011, + "grad_norm": 0.4407116770744324, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6911, + "grad_norm": 0.24736863374710083, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6959, + "grad_norm": 0.30900779366493225, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6942, + "grad_norm": 0.04139747843146324, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6995, + "grad_norm": 0.47512736916542053, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.693, + "grad_norm": 0.15486083924770355, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6958, + "grad_norm": 0.39790481328964233, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6933, + "grad_norm": 0.24841873347759247, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6953, + "grad_norm": 0.19071198999881744, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6966, + "grad_norm": 0.36508864164352417, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6916, + "grad_norm": 0.07497318834066391, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6948, + "grad_norm": 0.35155701637268066, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6954, + "grad_norm": 0.13070276379585266, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6928, + "grad_norm": 0.29972949624061584, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6961, + "grad_norm": 0.1651931256055832, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.709, + "grad_norm": 0.6671253442764282, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6985, + "grad_norm": 0.24082604050636292, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6947, + "grad_norm": 0.11839146167039871, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6971, + "grad_norm": 0.6211200952529907, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6952, + "grad_norm": 0.15069188177585602, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6938, + "grad_norm": 0.06709975749254227, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6949, + "grad_norm": 0.23215904831886292, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6919, + "grad_norm": 0.13334542512893677, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6926, + "grad_norm": 0.07445777952671051, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571956224, + "loss": 0.6948, + "grad_norm": 0.10290640592575073, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7117, + "grad_norm": 0.5153995752334595, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57195776, + "loss": 0.7143, + "grad_norm": 0.6969769597053528, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6924, + "grad_norm": 0.07216745615005493, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6948, + "grad_norm": 0.34937772154808044, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6957, + "grad_norm": 0.07198280096054077, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6978, + "grad_norm": 0.17043612897396088, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6912, + "grad_norm": 0.10138287395238876, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7105, + "grad_norm": 0.614494800567627, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6927, + "grad_norm": 0.05384859815239906, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6918, + "grad_norm": 0.07557376474142075, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571976192, + "loss": 0.6971, + "grad_norm": 0.1991368979215622, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6932, + "grad_norm": 0.2207423597574234, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.692, + "grad_norm": 0.2398618459701538, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6929, + "grad_norm": 0.05550159886479378, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7073, + "grad_norm": 0.4328235983848572, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6904, + "grad_norm": 0.05488348752260208, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7142, + "grad_norm": 0.6022237539291382, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6836, + "grad_norm": 0.198994979262352, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6904, + "grad_norm": 0.07985939830541611, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.7, + "grad_norm": 0.27188464999198914, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6892, + "grad_norm": 0.10269410163164139, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6926, + "grad_norm": 0.09785927832126617, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6912, + "grad_norm": 0.146536186337471, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6991, + "grad_norm": 0.38985446095466614, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6927, + "grad_norm": 0.2856346070766449, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6925, + "grad_norm": 0.15812620520591736, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6857, + "grad_norm": 0.22834262251853943, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6816, + "grad_norm": 0.2623056471347809, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6576, + "grad_norm": 0.47624626755714417, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6978, + "grad_norm": 0.26160383224487305, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7112, + "grad_norm": 0.41236573457717896, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571976192, + "loss": 0.7452, + "grad_norm": 0.7381967902183533, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.719, + "grad_norm": 0.5006628036499023, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.7152, + "grad_norm": 0.4342546761035919, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6872, + "grad_norm": 0.09059812873601913, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7037, + "grad_norm": 0.3153970539569855, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7045, + "grad_norm": 0.4084380269050598, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6928, + "grad_norm": 0.05508684739470482, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6945, + "grad_norm": 0.11397580802440643, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6815, + "grad_norm": 0.2261170893907547, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6832, + "grad_norm": 0.08420999348163605, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7011, + "grad_norm": 0.24950134754180908, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7046, + "grad_norm": 0.2992820143699646, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.675, + "grad_norm": 0.04479832947254181, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.7357, + "grad_norm": 0.5715765953063965, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7355, + "grad_norm": 0.5929423570632935, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7087, + "grad_norm": 0.3627922236919403, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7014, + "grad_norm": 0.21161341667175293, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6885, + "grad_norm": 0.11589790135622025, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6886, + "grad_norm": 0.17802953720092773, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6913, + "grad_norm": 0.044442273676395416, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6973, + "grad_norm": 0.3317153751850128, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6971, + "grad_norm": 0.29405876994132996, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6935, + "grad_norm": 0.062107376754283905, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6919, + "grad_norm": 0.16305236518383026, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6928, + "grad_norm": 0.1601140797138214, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571954688, + "loss": 0.695, + "grad_norm": 0.1680706888437271, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6929, + "grad_norm": 0.0900038629770279, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6905, + "grad_norm": 0.1903282254934311, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6929, + "grad_norm": 0.09070234000682831, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6946, + "grad_norm": 0.11729394644498825, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6876, + "grad_norm": 1.0032715797424316, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6994, + "grad_norm": 0.30010315775871277, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7006, + "grad_norm": 0.30009880661964417, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7068, + "grad_norm": 0.3801293969154358, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6982, + "grad_norm": 0.29767465591430664, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6984, + "grad_norm": 0.1592196673154831, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6975, + "grad_norm": 0.2913283109664917, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6962, + "grad_norm": 0.04952171444892883, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6949, + "grad_norm": 0.44019341468811035, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6885, + "grad_norm": 0.07596150040626526, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7032, + "grad_norm": 0.3380616009235382, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7101, + "grad_norm": 0.49890026450157166, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6998, + "grad_norm": 0.24058803915977478, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6806, + "grad_norm": 0.22572366893291473, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6838, + "grad_norm": 0.16291771829128265, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6831, + "grad_norm": 0.15589989721775055, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.697, + "grad_norm": 0.18399417400360107, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.702, + "grad_norm": 0.2719920873641968, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.7028, + "grad_norm": 0.3059929311275482, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6954, + "grad_norm": 0.1504541039466858, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571977728, + "loss": 0.6855, + "grad_norm": 0.252089262008667, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6982, + "grad_norm": 0.26580294966697693, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6979, + "grad_norm": 0.3089165687561035, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571977728, + "loss": 0.6945, + "grad_norm": 0.1964135617017746, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6925, + "grad_norm": 0.220304936170578, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7009, + "grad_norm": 0.2519533038139343, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6871, + "grad_norm": 0.26304638385772705, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7001, + "grad_norm": 0.20696626603603363, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6841, + "grad_norm": 0.12289115786552429, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6891, + "grad_norm": 0.06022673845291138, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7109, + "grad_norm": 0.33266085386276245, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7222, + "grad_norm": 0.5158587694168091, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.7047, + "grad_norm": 0.2701108455657959, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6863, + "grad_norm": 0.0768279954791069, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7012, + "grad_norm": 0.2522667348384857, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6918, + "grad_norm": 0.06267531961202621, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6998, + "grad_norm": 0.2994101941585541, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6934, + "grad_norm": 0.08431486785411835, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6905, + "grad_norm": 0.05766095593571663, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6917, + "grad_norm": 0.1012611985206604, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6955, + "grad_norm": 0.17278528213500977, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6682, + "grad_norm": 0.4198247492313385, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.694, + "grad_norm": 0.09453270584344864, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6948, + "grad_norm": 0.1261870563030243, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.7, + "grad_norm": 0.16208261251449585, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7213, + "grad_norm": 0.43665266036987305, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7183, + "grad_norm": 0.42699700593948364, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6759, + "grad_norm": 0.17939704656600952, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6967, + "grad_norm": 0.15480579435825348, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6933, + "grad_norm": 0.07884307950735092, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7039, + "grad_norm": 0.3113122284412384, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6865, + "grad_norm": 0.07262302190065384, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6897, + "grad_norm": 0.09824483096599579, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6922, + "grad_norm": 0.06731496006250381, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6911, + "grad_norm": 0.08393482118844986, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6925, + "grad_norm": 0.09099280834197998, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6935, + "grad_norm": 0.25519803166389465, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6963, + "grad_norm": 0.2510821223258972, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6925, + "grad_norm": 0.159589946269989, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6919, + "grad_norm": 0.1601668894290924, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6946, + "grad_norm": 0.038014672696590424, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6921, + "grad_norm": 0.09832153469324112, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6967, + "grad_norm": 0.2600041627883911, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.696, + "grad_norm": 0.25567150115966797, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6927, + "grad_norm": 0.16339796781539917, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6938, + "grad_norm": 0.33493927121162415, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6921, + "grad_norm": 0.04447793960571289, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6962, + "grad_norm": 0.22203843295574188, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6977, + "grad_norm": 0.33312079310417175, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6877, + "grad_norm": 0.4097679555416107, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6958, + "grad_norm": 0.12110136449337006, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6947, + "grad_norm": 0.13972948491573334, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6891, + "grad_norm": 0.22157472372055054, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6935, + "grad_norm": 0.04082195833325386, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.7031, + "grad_norm": 0.4175085723400116, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6923, + "grad_norm": 0.0752616599202156, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6911, + "grad_norm": 0.0635843425989151, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6863, + "grad_norm": 0.29497310519218445, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.695, + "grad_norm": 0.06148180365562439, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6945, + "grad_norm": 0.12305723875761032, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6946, + "grad_norm": 0.11568289250135422, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6913, + "grad_norm": 0.12653611600399017, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6936, + "grad_norm": 0.04978393763303757, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6979, + "grad_norm": 0.11181218177080154, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6945, + "grad_norm": 0.054673317819833755, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6936, + "grad_norm": 0.053271424025297165, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6892, + "grad_norm": 0.5475156903266907, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6923, + "grad_norm": 0.043399348855018616, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6906, + "grad_norm": 0.12547709047794342, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6947, + "grad_norm": 0.13459821045398712, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6888, + "grad_norm": 0.16310815513134003, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6888, + "grad_norm": 0.09876418113708496, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6858, + "grad_norm": 0.19554999470710754, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6871, + "grad_norm": 0.10782324522733688, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.687, + "grad_norm": 0.0909029096364975, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6866, + "grad_norm": 0.07580197602510452, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7046, + "grad_norm": 0.3055714964866638, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6944, + "grad_norm": 0.13993531465530396, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6676, + "grad_norm": 0.32887107133865356, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6746, + "grad_norm": 0.19721698760986328, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6966, + "grad_norm": 0.13519670069217682, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7138, + "grad_norm": 0.42395660281181335, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.7087, + "grad_norm": 0.35730594396591187, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.7046, + "grad_norm": 0.2927296757698059, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7186, + "grad_norm": 0.5007656812667847, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6992, + "grad_norm": 0.17160183191299438, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.7024, + "grad_norm": 0.31330177187919617, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6804, + "grad_norm": 0.4806714951992035, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.702, + "grad_norm": 0.4844644069671631, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6971, + "grad_norm": 0.11558495461940765, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6919, + "grad_norm": 0.11563748121261597, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6906, + "grad_norm": 0.32812657952308655, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6942, + "grad_norm": 0.06311943382024765, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6899, + "grad_norm": 0.05211424082517624, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6949, + "grad_norm": 0.09392490983009338, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.704, + "grad_norm": 0.40072453022003174, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.698, + "grad_norm": 0.2403048425912857, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6812, + "grad_norm": 0.27562740445137024, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7013, + "grad_norm": 0.3003266453742981, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6935, + "grad_norm": 0.13988588750362396, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6985, + "grad_norm": 0.18832312524318695, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6964, + "grad_norm": 0.11303382366895676, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6963, + "grad_norm": 0.10384392738342285, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6911, + "grad_norm": 0.08915739506483078, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6904, + "grad_norm": 0.09668831527233124, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6898, + "grad_norm": 0.10113918036222458, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6894, + "grad_norm": 0.12514659762382507, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.687, + "grad_norm": 0.11501681059598923, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6964, + "grad_norm": 0.25200155377388, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.7018, + "grad_norm": 0.3061540424823761, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6944, + "grad_norm": 0.04331190511584282, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6987, + "grad_norm": 0.28217047452926636, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6887, + "grad_norm": 0.1892416626214981, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6974, + "grad_norm": 0.16187840700149536, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6941, + "grad_norm": 0.36968836188316345, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6936, + "grad_norm": 0.3441619873046875, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6924, + "grad_norm": 0.17151501774787903, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6933, + "grad_norm": 0.30534347891807556, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6912, + "grad_norm": 0.14173726737499237, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6955, + "grad_norm": 0.4236004054546356, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6896, + "grad_norm": 0.1186327263712883, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6932, + "grad_norm": 0.4656846821308136, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6949, + "grad_norm": 0.1734677255153656, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6936, + "grad_norm": 0.3158358037471771, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6949, + "grad_norm": 0.42844414710998535, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6948, + "grad_norm": 0.08941220492124557, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6957, + "grad_norm": 0.14115017652511597, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6942, + "grad_norm": 0.05152183771133423, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6936, + "grad_norm": 0.2164037972688675, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6922, + "grad_norm": 0.25941720604896545, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6947, + "grad_norm": 0.04531259834766388, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6944, + "grad_norm": 0.3211238384246826, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6912, + "grad_norm": 0.05825451388955116, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6922, + "grad_norm": 0.0583849661052227, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6888, + "grad_norm": 0.2603738307952881, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6945, + "grad_norm": 0.266971617937088, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6977, + "grad_norm": 0.11055431514978409, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6936, + "grad_norm": 0.054498959332704544, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6921, + "grad_norm": 0.050303343683481216, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6913, + "grad_norm": 0.15094709396362305, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6916, + "grad_norm": 0.2683545649051666, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.696, + "grad_norm": 0.25272372364997864, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6936, + "grad_norm": 0.2502441108226776, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6896, + "grad_norm": 0.5063999891281128, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6871, + "grad_norm": 0.6133648753166199, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6935, + "grad_norm": 0.04791819304227829, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6902, + "grad_norm": 0.24886314570903778, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6923, + "grad_norm": 0.05471263825893402, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6873, + "grad_norm": 0.18334780633449554, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6936, + "grad_norm": 0.07117155939340591, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6898, + "grad_norm": 0.08556237071752548, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.7012, + "grad_norm": 0.37769514322280884, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.692, + "grad_norm": 0.09495555609464645, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6916, + "grad_norm": 0.10709106177091599, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6883, + "grad_norm": 0.10131390392780304, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6979, + "grad_norm": 0.36316922307014465, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6836, + "grad_norm": 0.2532973885536194, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571979264, + "loss": 0.7029, + "grad_norm": 0.43310075998306274, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7021, + "grad_norm": 0.4239107668399811, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6947, + "grad_norm": 0.1901184320449829, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6952, + "grad_norm": 0.12347812950611115, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6969, + "grad_norm": 0.10874398797750473, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6949, + "grad_norm": 0.1623145490884781, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6896, + "grad_norm": 0.15935073792934418, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6916, + "grad_norm": 0.19807302951812744, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6923, + "grad_norm": 0.08826253563165665, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6902, + "grad_norm": 0.3971004784107208, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6915, + "grad_norm": 0.059857264161109924, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6947, + "grad_norm": 0.16927236318588257, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6928, + "grad_norm": 0.16673800349235535, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6874, + "grad_norm": 0.058378253132104874, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6955, + "grad_norm": 0.08364900201559067, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6923, + "grad_norm": 0.04720776155591011, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6898, + "grad_norm": 0.20455242693424225, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6913, + "grad_norm": 0.09009282290935516, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6925, + "grad_norm": 0.09538199752569199, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571976192, + "loss": 0.6889, + "grad_norm": 0.14223554730415344, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6958, + "grad_norm": 0.21968699991703033, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6918, + "grad_norm": 0.07953153550624847, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6954, + "grad_norm": 0.17260029911994934, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6866, + "grad_norm": 0.2895820438861847, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6863, + "grad_norm": 0.6304632425308228, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6904, + "grad_norm": 0.06288617104291916, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6983, + "grad_norm": 0.3000217378139496, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6984, + "grad_norm": 0.24716979265213013, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6908, + "grad_norm": 0.16789042949676514, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6929, + "grad_norm": 0.20585864782333374, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.692, + "grad_norm": 0.055791694670915604, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7056, + "grad_norm": 0.4438885748386383, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6964, + "grad_norm": 0.14871558547019958, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6911, + "grad_norm": 0.1117328330874443, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6916, + "grad_norm": 0.05138324573636055, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.702, + "grad_norm": 0.48615556955337524, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.7004, + "grad_norm": 0.31352537870407104, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6908, + "grad_norm": 0.17119790613651276, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6997, + "grad_norm": 0.34925609827041626, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6898, + "grad_norm": 0.0697748139500618, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6946, + "grad_norm": 0.11803988367319107, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6874, + "grad_norm": 0.3019244074821472, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6928, + "grad_norm": 0.17132940888404846, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6894, + "grad_norm": 0.18543866276741028, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6918, + "grad_norm": 0.04336598515510559, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6989, + "grad_norm": 0.22866995632648468, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6909, + "grad_norm": 0.05693443492054939, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.694, + "grad_norm": 0.08117138594388962, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6937, + "grad_norm": 0.04590700566768646, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6881, + "grad_norm": 0.37014177441596985, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6895, + "grad_norm": 0.13581690192222595, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6911, + "grad_norm": 0.05031242594122887, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.693, + "grad_norm": 0.05726393312215805, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6899, + "grad_norm": 0.05312518775463104, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6898, + "grad_norm": 0.14171838760375977, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6982, + "grad_norm": 0.39349818229675293, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571974656, + "loss": 0.6875, + "grad_norm": 0.09803467988967896, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6886, + "grad_norm": 0.31059136986732483, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6955, + "grad_norm": 0.2113228738307953, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6978, + "grad_norm": 0.3317917585372925, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6944, + "grad_norm": 0.5013271570205688, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.6968, + "grad_norm": 0.2707918882369995, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6928, + "grad_norm": 0.20954295992851257, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6981, + "grad_norm": 0.49577978253364563, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57197312, + "loss": 0.691, + "grad_norm": 0.05905291438102722, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6902, + "grad_norm": 0.258797824382782, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6938, + "grad_norm": 0.10444536805152893, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6922, + "grad_norm": 0.2676865756511688, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6934, + "grad_norm": 0.14509393274784088, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6929, + "grad_norm": 0.1651117503643036, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6906, + "grad_norm": 0.08684806525707245, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6921, + "grad_norm": 0.1388402134180069, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571971584, + "loss": 0.6911, + "grad_norm": 0.2360139787197113, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6938, + "grad_norm": 0.10008968412876129, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571960832, + "loss": 0.6946, + "grad_norm": 0.42079946398735046, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6911, + "grad_norm": 0.19489826261997223, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6898, + "grad_norm": 0.1686435043811798, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6926, + "grad_norm": 0.22877652943134308, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6928, + "grad_norm": 0.3493768572807312, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.697, + "grad_norm": 0.08688914775848389, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6922, + "grad_norm": 0.21941912174224854, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6916, + "grad_norm": 0.1675485223531723, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6925, + "grad_norm": 0.06934154033660889, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6895, + "grad_norm": 0.21687670052051544, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6935, + "grad_norm": 0.22348476946353912, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6906, + "grad_norm": 0.043734513223171234, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6917, + "grad_norm": 0.1843772828578949, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6917, + "grad_norm": 0.044150326400995255, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6928, + "grad_norm": 0.06122425198554993, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6905, + "grad_norm": 0.2708818018436432, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6941, + "grad_norm": 0.3593442142009735, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6983, + "grad_norm": 0.1404038816690445, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6894, + "grad_norm": 0.28675106167793274, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6948, + "grad_norm": 0.13724032044410706, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6908, + "grad_norm": 0.10844231396913528, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6921, + "grad_norm": 0.043847937136888504, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6938, + "grad_norm": 0.13400162756443024, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.691, + "grad_norm": 0.12805011868476868, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571963904, + "loss": 0.6912, + "grad_norm": 0.22486035525798798, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571962368, + "loss": 0.6933, + "grad_norm": 0.25307610630989075, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571976192, + "loss": 0.6904, + "grad_norm": 0.1897202581167221, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6909, + "grad_norm": 0.1058412715792656, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6901, + "grad_norm": 0.04906204342842102, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571966976, + "loss": 0.6918, + "grad_norm": 0.2431795597076416, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6914, + "grad_norm": 0.06553512066602707, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571959296, + "loss": 0.6922, + "grad_norm": 0.06544186174869537, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571970048, + "loss": 0.6944, + "grad_norm": 0.4074690043926239, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6922, + "grad_norm": 0.2239709496498108, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.717010432, + "gpu_mem": 4.571968512, + "loss": 0.6912, + "grad_norm": 0.2206483632326126, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "loss": 0.6926, + "grad_norm": 0.19011861085891724, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.717010432, + "gpu_mem": 4.57196544, + "train_runtime": 1389.6218, + "train_samples_per_second": 29.458, + "train_steps_per_second": 0.461, + "total_flos": 0.0, + "train_loss": 0.721914070378989 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/adapter_config.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ded8039b496858a8aa3d756f427279337f8964 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weight": "kaiming", + "layers_pattern": null, + "layers_to_transform": null, + "metric_tracking": false, + "modules_to_save": null, + "peft_type": "ABLATION", + "r": 8, + "revision": null, + "seed": 42, + "share_weights": false, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj", + "v_proj" + ], + "task_type": null, + "track_n": 100, + "variant": "D" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/eval_results.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d9b127e0dc1d92f886c507640ca6ee7256441c0a --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5130228887134964 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/training_configuration.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..cf827197bec2c2451f7d0651031af7a85a019690 --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "abl_D", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3163776 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-abl_D-winogrande-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2", + "seed": 42, + "timestamp": "2025-09-02T15:18:43.762369" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/training_logs.json b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..c112cbb63d08203f07769e50dadc56b80231e5bd --- /dev/null +++ b/TinyLlama_v1.1-abl_D/TinyLlama_v1.1-abl_D-winogrande-r8-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.699598336, + "gpu_mem": 4.430642688, + "loss": 3.3802, + "grad_norm": 17.330472946166992, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.70018816, + "gpu_mem": 4.456029696, + "loss": 3.3361, + "grad_norm": 17.051837921142578, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.700384768, + "gpu_mem": 4.456034304, + "loss": 3.2022, + "grad_norm": 16.11080551147461, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.700777984, + "gpu_mem": 4.456032768, + "loss": 3.1108, + "grad_norm": 16.4958553314209, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.7011712, + "gpu_mem": 4.456032768, + "loss": 3.0432, + "grad_norm": 16.20774269104004, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.701367808, + "gpu_mem": 4.456038912, + "loss": 2.9196, + "grad_norm": 16.535968780517578, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.701564416, + "gpu_mem": 4.456045056, + "loss": 2.6913, + "grad_norm": 16.048547744750977, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.701761024, + "gpu_mem": 4.45602816, + "loss": 2.5014, + "grad_norm": 14.602679252624512, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.701957632, + "gpu_mem": 4.456034304, + "loss": 2.4781, + "grad_norm": 14.933575630187988, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.70215424, + "gpu_mem": 4.456037376, + "loss": 2.0968, + "grad_norm": 13.542939186096191, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.702350848, + "gpu_mem": 4.456026624, + "loss": 1.8626, + "grad_norm": 11.93506145477295, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.702547456, + "gpu_mem": 4.456031232, + "loss": 1.8008, + "grad_norm": 10.587722778320312, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.702547456, + "gpu_mem": 4.456038912, + "loss": 1.4827, + "grad_norm": 7.889034748077393, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.702744064, + "gpu_mem": 4.456034304, + "loss": 1.3361, + "grad_norm": 6.314718246459961, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.702940672, + "gpu_mem": 4.456034304, + "loss": 1.1456, + "grad_norm": 4.480144023895264, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.702940672, + "gpu_mem": 4.456031232, + "loss": 1.036, + "grad_norm": 4.192731857299805, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.702940672, + "gpu_mem": 4.456031232, + "loss": 0.8619, + "grad_norm": 2.553569793701172, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.70313728, + "gpu_mem": 4.456034304, + "loss": 0.8535, + "grad_norm": 2.1433348655700684, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.703333888, + "gpu_mem": 4.456031232, + "loss": 0.8081, + "grad_norm": 1.9438246488571167, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.703333888, + "gpu_mem": 4.456038912, + "loss": 0.8132, + "grad_norm": 2.955030679702759, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.703333888, + "gpu_mem": 4.456031232, + "loss": 0.7452, + "grad_norm": 1.3201414346694946, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.703333888, + "gpu_mem": 4.456031232, + "loss": 0.7268, + "grad_norm": 1.7582844495773315, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.703333888, + "gpu_mem": 4.456026624, + "loss": 0.6818, + "grad_norm": 1.6355748176574707, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.703333888, + "gpu_mem": 4.456029696, + "loss": 0.7419, + "grad_norm": 2.1497550010681152, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.703333888, + "gpu_mem": 4.456032768, + "loss": 0.6973, + "grad_norm": 1.732041597366333, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.703530496, + "gpu_mem": 4.45602816, + "loss": 0.6983, + "grad_norm": 3.0203518867492676, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7326, + "grad_norm": 2.906920909881592, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7273, + "grad_norm": 0.9345660209655762, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7332, + "grad_norm": 2.354891538619995, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.763, + "grad_norm": 2.734105348587036, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6976, + "grad_norm": 1.4779430627822876, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6571, + "grad_norm": 1.2661937475204468, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.8514, + "grad_norm": 5.971726417541504, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.8238, + "grad_norm": 5.169806957244873, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7352, + "grad_norm": 2.8325142860412598, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7224, + "grad_norm": 0.9273037314414978, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7062, + "grad_norm": 1.2120202779769897, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6945, + "grad_norm": 0.7735399007797241, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.7688, + "grad_norm": 1.6971741914749146, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7165, + "grad_norm": 0.6731290221214294, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7491, + "grad_norm": 1.1540348529815674, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.68, + "grad_norm": 0.42739537358283997, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6892, + "grad_norm": 0.6731612682342529, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7244, + "grad_norm": 0.7660074830055237, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.7196, + "grad_norm": 0.8986096382141113, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7193, + "grad_norm": 0.5424149632453918, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.724, + "grad_norm": 0.35452237725257874, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7034, + "grad_norm": 0.6182721257209778, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.68, + "grad_norm": 0.7303750514984131, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7192, + "grad_norm": 0.41640040278434753, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6937, + "grad_norm": 0.24383771419525146, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7092, + "grad_norm": 0.6195520758628845, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7264, + "grad_norm": 0.8763359189033508, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456022016, + "loss": 0.7047, + "grad_norm": 0.4118117392063141, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6887, + "grad_norm": 0.233088880777359, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6935, + "grad_norm": 0.20324158668518066, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.7051, + "grad_norm": 0.5539292693138123, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7065, + "grad_norm": 0.3275444805622101, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6867, + "grad_norm": 0.3392412066459656, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7088, + "grad_norm": 0.46076229214668274, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6998, + "grad_norm": 0.19276709854602814, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6993, + "grad_norm": 0.2592833936214447, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7091, + "grad_norm": 0.6501164436340332, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6925, + "grad_norm": 0.2763468325138092, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7085, + "grad_norm": 0.5380356311798096, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7091, + "grad_norm": 0.3734208345413208, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6827, + "grad_norm": 0.38056251406669617, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7103, + "grad_norm": 0.3159506618976593, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6907, + "grad_norm": 0.24412059783935547, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.7101, + "grad_norm": 0.5731344223022461, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6958, + "grad_norm": 0.1965249925851822, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6928, + "grad_norm": 0.20343318581581116, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7106, + "grad_norm": 0.7042127847671509, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7064, + "grad_norm": 0.3276800811290741, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7151, + "grad_norm": 0.8003770709037781, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7027, + "grad_norm": 0.557656466960907, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6974, + "grad_norm": 0.1641213446855545, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6938, + "grad_norm": 0.26423755288124084, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7047, + "grad_norm": 0.2656375467777252, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6985, + "grad_norm": 0.19438022375106812, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6888, + "grad_norm": 0.19698256254196167, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6945, + "grad_norm": 0.2216099053621292, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6973, + "grad_norm": 0.46508583426475525, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6882, + "grad_norm": 0.2503087818622589, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6958, + "grad_norm": 0.2057289332151413, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7007, + "grad_norm": 0.3335287868976593, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.698, + "grad_norm": 0.28353646397590637, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6944, + "grad_norm": 0.27222684025764465, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6969, + "grad_norm": 0.2575468420982361, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6952, + "grad_norm": 0.39506685733795166, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6998, + "grad_norm": 0.7814059853553772, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7028, + "grad_norm": 0.46568307280540466, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6937, + "grad_norm": 0.2545846402645111, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7107, + "grad_norm": 0.2428673952817917, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6834, + "grad_norm": 0.25050491094589233, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7057, + "grad_norm": 0.6383910775184631, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.698, + "grad_norm": 0.16576214134693146, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6949, + "grad_norm": 0.1572464555501938, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7004, + "grad_norm": 0.6269007325172424, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6963, + "grad_norm": 0.2632835805416107, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7037, + "grad_norm": 0.3220721483230591, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.718, + "grad_norm": 0.5996019244194031, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6842, + "grad_norm": 0.19360452890396118, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7113, + "grad_norm": 0.4986571669578552, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6983, + "grad_norm": 0.25528281927108765, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6965, + "grad_norm": 0.20703844726085663, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6982, + "grad_norm": 0.21419017016887665, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6911, + "grad_norm": 0.16305585205554962, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7126, + "grad_norm": 0.360077828168869, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7123, + "grad_norm": 0.5474017262458801, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7003, + "grad_norm": 0.1723344922065735, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.699, + "grad_norm": 0.13713021576404572, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456023552, + "loss": 0.6895, + "grad_norm": 0.3542085886001587, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6927, + "grad_norm": 0.18892717361450195, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7055, + "grad_norm": 0.38129350543022156, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7591, + "grad_norm": 1.0986248254776, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6934, + "grad_norm": 0.20243695378303528, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6981, + "grad_norm": 0.26343855261802673, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7088, + "grad_norm": 0.7019100785255432, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6982, + "grad_norm": 0.30614131689071655, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6916, + "grad_norm": 0.18096567690372467, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6847, + "grad_norm": 0.4552153944969177, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7003, + "grad_norm": 0.20824210345745087, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7069, + "grad_norm": 0.3840045928955078, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456023552, + "loss": 0.6972, + "grad_norm": 0.23416200280189514, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.699, + "grad_norm": 0.31037354469299316, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6967, + "grad_norm": 0.22400780022144318, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7005, + "grad_norm": 0.3288860321044922, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.704, + "grad_norm": 0.17490486800670624, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7062, + "grad_norm": 0.25320348143577576, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6961, + "grad_norm": 0.1575758457183838, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7093, + "grad_norm": 0.4376318156719208, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7147, + "grad_norm": 0.7222729325294495, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6962, + "grad_norm": 0.07905472069978714, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6843, + "grad_norm": 0.3347111642360687, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6801, + "grad_norm": 0.16417890787124634, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.697, + "grad_norm": 0.19577834010124207, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7176, + "grad_norm": 0.45092537999153137, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.739, + "grad_norm": 0.6200214624404907, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6973, + "grad_norm": 0.2689876854419708, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6859, + "grad_norm": 0.18331760168075562, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6935, + "grad_norm": 0.19338366389274597, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7048, + "grad_norm": 0.25354376435279846, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.698, + "grad_norm": 0.22968712449073792, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6902, + "grad_norm": 0.21939389407634735, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6922, + "grad_norm": 0.27763479948043823, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6955, + "grad_norm": 0.21394118666648865, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6935, + "grad_norm": 0.18833930790424347, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7011, + "grad_norm": 0.3847750127315521, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6978, + "grad_norm": 0.36875686049461365, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6973, + "grad_norm": 0.18281979858875275, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6938, + "grad_norm": 0.10258230566978455, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6889, + "grad_norm": 0.17077164351940155, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7004, + "grad_norm": 0.24183516204357147, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7093, + "grad_norm": 0.4858053922653198, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6956, + "grad_norm": 0.120506152510643, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7049, + "grad_norm": 1.0144362449645996, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456041984, + "loss": 0.693, + "grad_norm": 0.15784592926502228, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6893, + "grad_norm": 0.1060345470905304, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7061, + "grad_norm": 0.32761263847351074, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7148, + "grad_norm": 0.3958893120288849, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7125, + "grad_norm": 0.47067391872406006, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7292, + "grad_norm": 0.7118585705757141, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7031, + "grad_norm": 0.5785593390464783, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6858, + "grad_norm": 0.4318457543849945, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7207, + "grad_norm": 0.8019049763679504, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7524, + "grad_norm": 1.2010325193405151, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7292, + "grad_norm": 0.8006308078765869, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7115, + "grad_norm": 0.5251290202140808, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.7027, + "grad_norm": 0.4066130518913269, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7017, + "grad_norm": 0.43078696727752686, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6959, + "grad_norm": 0.13528597354888916, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456023552, + "loss": 0.7035, + "grad_norm": 0.13586246967315674, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6949, + "grad_norm": 0.15380734205245972, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6973, + "grad_norm": 0.3080759048461914, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6903, + "grad_norm": 0.2749730348587036, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7211, + "grad_norm": 0.5025511384010315, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6902, + "grad_norm": 0.27959924936294556, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6986, + "grad_norm": 0.20522993803024292, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6891, + "grad_norm": 0.21163153648376465, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456046592, + "loss": 0.6899, + "grad_norm": 0.691405713558197, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.698, + "grad_norm": 0.665687620639801, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6911, + "grad_norm": 0.5429388880729675, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6851, + "grad_norm": 0.3378201127052307, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7025, + "grad_norm": 0.2237425446510315, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7243, + "grad_norm": 0.7370006442070007, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6672, + "grad_norm": 0.22516986727714539, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6974, + "grad_norm": 0.4589906632900238, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7115, + "grad_norm": 0.5878050327301025, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6859, + "grad_norm": 0.3077067732810974, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6972, + "grad_norm": 0.4824809432029724, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6819, + "grad_norm": 0.5308559536933899, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.695, + "grad_norm": 0.1901646852493286, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6926, + "grad_norm": 0.27227503061294556, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7594, + "grad_norm": 1.1576486825942993, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6897, + "grad_norm": 0.4159877896308899, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.7169, + "grad_norm": 0.30037254095077515, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6958, + "grad_norm": 0.15459652245044708, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6852, + "grad_norm": 0.1492416262626648, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7255, + "grad_norm": 0.6600925326347351, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.7285, + "grad_norm": 0.5197784304618835, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6905, + "grad_norm": 0.18552690744400024, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.748, + "grad_norm": 0.9276514053344727, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6996, + "grad_norm": 0.2500166594982147, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6834, + "grad_norm": 0.16842703521251678, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.708, + "grad_norm": 0.21827392280101776, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7021, + "grad_norm": 0.17127957940101624, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6804, + "grad_norm": 0.13762445747852325, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7407, + "grad_norm": 0.6228614449501038, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7363, + "grad_norm": 0.5276408791542053, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6877, + "grad_norm": 0.16867157816886902, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6991, + "grad_norm": 0.19430650770664215, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7072, + "grad_norm": 0.29926982522010803, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6897, + "grad_norm": 0.10244950652122498, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6918, + "grad_norm": 0.26906269788742065, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6937, + "grad_norm": 0.2990459203720093, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7034, + "grad_norm": 0.2346249222755432, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7154, + "grad_norm": 0.48543885350227356, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6956, + "grad_norm": 0.11397181451320648, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7007, + "grad_norm": 0.20102454721927643, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6757, + "grad_norm": 0.19539330899715424, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7053, + "grad_norm": 0.23400302231311798, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6886, + "grad_norm": 0.07748206704854965, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6972, + "grad_norm": 0.1449657380580902, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7119, + "grad_norm": 0.32834935188293457, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.703, + "grad_norm": 0.22646580636501312, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6979, + "grad_norm": 0.09850948303937912, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6941, + "grad_norm": 0.08055689185857773, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6967, + "grad_norm": 0.09169227629899979, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6934, + "grad_norm": 0.1622571349143982, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6901, + "grad_norm": 0.21527031064033508, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6968, + "grad_norm": 0.09339339286088943, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6999, + "grad_norm": 0.14290744066238403, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456041984, + "loss": 0.6914, + "grad_norm": 0.16555726528167725, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7052, + "grad_norm": 0.2937389016151428, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6871, + "grad_norm": 0.2382066249847412, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.7162, + "grad_norm": 0.4082764983177185, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.7228, + "grad_norm": 0.6265595555305481, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6926, + "grad_norm": 0.12580931186676025, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6953, + "grad_norm": 0.0716833770275116, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6871, + "grad_norm": 0.3441932201385498, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6854, + "grad_norm": 0.17742300033569336, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.7155, + "grad_norm": 0.47247934341430664, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7103, + "grad_norm": 0.31267666816711426, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7119, + "grad_norm": 0.35581332445144653, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6942, + "grad_norm": 0.10159732401371002, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7081, + "grad_norm": 0.25603386759757996, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6863, + "grad_norm": 0.12361364811658859, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6978, + "grad_norm": 0.28085654973983765, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6933, + "grad_norm": 0.20397578179836273, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.699, + "grad_norm": 0.19108420610427856, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.694, + "grad_norm": 0.2400539070367813, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6965, + "grad_norm": 0.08949398994445801, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6968, + "grad_norm": 0.11189975589513779, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6872, + "grad_norm": 0.08785166591405869, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7036, + "grad_norm": 0.24093002080917358, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6961, + "grad_norm": 0.12372710555791855, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7087, + "grad_norm": 0.3642513155937195, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6853, + "grad_norm": 0.19357043504714966, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6879, + "grad_norm": 0.06451836228370667, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7174, + "grad_norm": 0.5284934639930725, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6926, + "grad_norm": 0.11590387672185898, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456041984, + "loss": 0.693, + "grad_norm": 0.09077291190624237, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6921, + "grad_norm": 0.0868513360619545, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6967, + "grad_norm": 0.1370062530040741, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.69, + "grad_norm": 0.13203848898410797, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6902, + "grad_norm": 0.1182737648487091, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7058, + "grad_norm": 0.3541680574417114, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6892, + "grad_norm": 0.1335131824016571, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7064, + "grad_norm": 0.3369695544242859, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.694, + "grad_norm": 0.13077911734580994, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.685, + "grad_norm": 0.21736779808998108, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7031, + "grad_norm": 0.26307982206344604, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6828, + "grad_norm": 0.18123528361320496, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7034, + "grad_norm": 0.29030510783195496, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7017, + "grad_norm": 0.22276875376701355, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6907, + "grad_norm": 0.22180457413196564, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6906, + "grad_norm": 0.09787695854902267, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6975, + "grad_norm": 0.2720576524734497, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6984, + "grad_norm": 0.12415912747383118, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6925, + "grad_norm": 0.2172631472349167, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6977, + "grad_norm": 0.14696194231510162, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7141, + "grad_norm": 0.5196764469146729, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7046, + "grad_norm": 0.2294853925704956, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6976, + "grad_norm": 0.16527678072452545, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6855, + "grad_norm": 0.37758520245552063, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6951, + "grad_norm": 0.07231743633747101, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.695, + "grad_norm": 0.07208273559808731, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6947, + "grad_norm": 0.13992980122566223, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6901, + "grad_norm": 0.10304129868745804, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6945, + "grad_norm": 0.0780540183186531, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456022016, + "loss": 0.696, + "grad_norm": 0.06909530609846115, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7063, + "grad_norm": 0.349342405796051, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456023552, + "loss": 0.7086, + "grad_norm": 0.49702563881874084, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6928, + "grad_norm": 0.08309005945920944, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6958, + "grad_norm": 0.26925718784332275, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6959, + "grad_norm": 0.07232359796762466, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6972, + "grad_norm": 0.1169787272810936, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6925, + "grad_norm": 0.11477385461330414, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7035, + "grad_norm": 0.4322449862957001, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6926, + "grad_norm": 0.07309428602457047, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6919, + "grad_norm": 0.08718182146549225, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456041984, + "loss": 0.6992, + "grad_norm": 0.15625004470348358, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6941, + "grad_norm": 0.20362500846385956, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6951, + "grad_norm": 0.22174617648124695, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6951, + "grad_norm": 0.10130838304758072, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6961, + "grad_norm": 0.2783345580101013, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6923, + "grad_norm": 0.13675889372825623, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7037, + "grad_norm": 0.41442885994911194, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6877, + "grad_norm": 0.2158603072166443, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6919, + "grad_norm": 0.10658439993858337, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6984, + "grad_norm": 0.1933278739452362, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6887, + "grad_norm": 0.09343045204877853, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6912, + "grad_norm": 0.11637547612190247, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6909, + "grad_norm": 0.0888054370880127, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7036, + "grad_norm": 0.35265836119651794, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6981, + "grad_norm": 0.2728828191757202, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6958, + "grad_norm": 0.22352907061576843, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6911, + "grad_norm": 0.26762253046035767, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6945, + "grad_norm": 0.31644365191459656, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6749, + "grad_norm": 0.49289509654045105, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6919, + "grad_norm": 0.09819575399160385, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.698, + "grad_norm": 0.1934129297733307, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456041984, + "loss": 0.7215, + "grad_norm": 0.45884793996810913, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7083, + "grad_norm": 0.3133298456668854, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.7108, + "grad_norm": 0.3089672923088074, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6873, + "grad_norm": 0.09201841056346893, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7139, + "grad_norm": 0.32476553320884705, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7193, + "grad_norm": 0.4434891641139984, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.698, + "grad_norm": 0.18232925236225128, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6924, + "grad_norm": 0.09980880469083786, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.702, + "grad_norm": 0.3917293846607208, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6954, + "grad_norm": 0.2708187699317932, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.693, + "grad_norm": 0.07552526891231537, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6931, + "grad_norm": 0.10803566128015518, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6773, + "grad_norm": 0.1511460691690445, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7159, + "grad_norm": 0.4196055829524994, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7249, + "grad_norm": 0.5000525712966919, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7112, + "grad_norm": 0.3563736379146576, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7065, + "grad_norm": 0.2568931579589844, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6884, + "grad_norm": 0.07310066372156143, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6847, + "grad_norm": 0.07713677734136581, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6914, + "grad_norm": 0.16349926590919495, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6881, + "grad_norm": 0.14680688083171844, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6872, + "grad_norm": 0.12180665135383606, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.694, + "grad_norm": 0.11654186248779297, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6955, + "grad_norm": 0.24101006984710693, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6939, + "grad_norm": 0.21055243909358978, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602048, + "loss": 0.6964, + "grad_norm": 0.11897505819797516, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6915, + "grad_norm": 0.08952657878398895, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6899, + "grad_norm": 0.1511620432138443, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6961, + "grad_norm": 0.125346839427948, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6919, + "grad_norm": 0.07630880922079086, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6702, + "grad_norm": 0.7390841245651245, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6987, + "grad_norm": 0.2715112566947937, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.7031, + "grad_norm": 0.27261266112327576, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7053, + "grad_norm": 0.34147873520851135, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6969, + "grad_norm": 0.2782283425331116, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.696, + "grad_norm": 0.17512324452400208, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7021, + "grad_norm": 0.28349336981773376, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6947, + "grad_norm": 0.0893184170126915, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7039, + "grad_norm": 0.44234317541122437, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6938, + "grad_norm": 0.17111903429031372, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6937, + "grad_norm": 0.19343946874141693, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6926, + "grad_norm": 0.29134827852249146, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6967, + "grad_norm": 0.1319189816713333, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6805, + "grad_norm": 0.25175121426582336, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6888, + "grad_norm": 0.17317324876785278, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.683, + "grad_norm": 0.14933162927627563, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6945, + "grad_norm": 0.13434858620166779, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6996, + "grad_norm": 0.2197239249944687, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.7041, + "grad_norm": 0.24462877213954926, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6966, + "grad_norm": 0.1500042974948883, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45604352, + "loss": 0.681, + "grad_norm": 0.17176175117492676, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7046, + "grad_norm": 0.3065783977508545, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7075, + "grad_norm": 0.3401780128479004, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45604352, + "loss": 0.7022, + "grad_norm": 0.25689321756362915, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6989, + "grad_norm": 0.29288023710250854, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6944, + "grad_norm": 0.1302637755870819, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6997, + "grad_norm": 0.32580849528312683, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6932, + "grad_norm": 0.11105800420045853, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6904, + "grad_norm": 0.20618505775928497, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6923, + "grad_norm": 0.13942557573318481, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7082, + "grad_norm": 0.24411270022392273, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7127, + "grad_norm": 0.41210344433784485, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.7016, + "grad_norm": 0.2096068412065506, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6854, + "grad_norm": 0.08161913603544235, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.703, + "grad_norm": 0.24621932208538055, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6937, + "grad_norm": 0.1236347034573555, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7074, + "grad_norm": 0.3155075013637543, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.696, + "grad_norm": 0.16603218019008636, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6925, + "grad_norm": 0.1014234647154808, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7007, + "grad_norm": 0.21574853360652924, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.691, + "grad_norm": 0.06068209558725357, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6933, + "grad_norm": 0.48437970876693726, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6963, + "grad_norm": 0.08715514838695526, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6921, + "grad_norm": 0.054307691752910614, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6966, + "grad_norm": 0.06114857271313667, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7044, + "grad_norm": 0.27321651577949524, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7065, + "grad_norm": 0.27761924266815186, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6794, + "grad_norm": 0.2004023641347885, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6978, + "grad_norm": 0.11827881634235382, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6963, + "grad_norm": 0.09610182046890259, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7095, + "grad_norm": 0.29769662022590637, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6882, + "grad_norm": 0.07882243394851685, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6883, + "grad_norm": 0.06755933910608292, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6901, + "grad_norm": 0.09040128439664841, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6897, + "grad_norm": 0.07503551244735718, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6924, + "grad_norm": 0.0734320729970932, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6867, + "grad_norm": 0.12100972980260849, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.7079, + "grad_norm": 0.32620811462402344, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6905, + "grad_norm": 0.06201591342687607, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6878, + "grad_norm": 0.07189074903726578, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6967, + "grad_norm": 0.11751672625541687, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6957, + "grad_norm": 0.16729162633419037, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6991, + "grad_norm": 0.2767026722431183, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7028, + "grad_norm": 0.25911518931388855, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6902, + "grad_norm": 0.11466385424137115, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6972, + "grad_norm": 0.3180299997329712, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6917, + "grad_norm": 0.05426226183772087, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6976, + "grad_norm": 0.1753247082233429, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6961, + "grad_norm": 0.2734011709690094, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6881, + "grad_norm": 0.33922427892684937, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6979, + "grad_norm": 0.12582628428936005, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6931, + "grad_norm": 0.14488649368286133, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.689, + "grad_norm": 0.17252856492996216, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6972, + "grad_norm": 0.08293557912111282, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.7092, + "grad_norm": 0.3822398781776428, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6942, + "grad_norm": 0.10763350129127502, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6907, + "grad_norm": 0.06001412123441696, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.683, + "grad_norm": 0.22379964590072632, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6986, + "grad_norm": 0.09788617491722107, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6953, + "grad_norm": 0.14352932572364807, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6974, + "grad_norm": 0.14477844536304474, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6895, + "grad_norm": 0.10970351845026016, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6916, + "grad_norm": 0.07371252030134201, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.7008, + "grad_norm": 0.12519006431102753, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6967, + "grad_norm": 0.09527359157800674, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6943, + "grad_norm": 0.07872035354375839, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6855, + "grad_norm": 0.4546462297439575, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6931, + "grad_norm": 0.058027856051921844, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6913, + "grad_norm": 0.1112491562962532, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.695, + "grad_norm": 0.12150882929563522, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6922, + "grad_norm": 0.160121887922287, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6934, + "grad_norm": 0.10320843011140823, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6871, + "grad_norm": 0.1988602876663208, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6884, + "grad_norm": 0.12908147275447845, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6884, + "grad_norm": 0.12128958851099014, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6886, + "grad_norm": 0.11044973134994507, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7032, + "grad_norm": 0.22270995378494263, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6947, + "grad_norm": 0.10446536540985107, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6749, + "grad_norm": 0.31897568702697754, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6773, + "grad_norm": 0.20788855850696564, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6981, + "grad_norm": 0.08838234096765518, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7051, + "grad_norm": 0.32461225986480713, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7027, + "grad_norm": 0.26527076959609985, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7018, + "grad_norm": 0.22826281189918518, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7128, + "grad_norm": 0.4096078872680664, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7016, + "grad_norm": 0.15578070282936096, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.7064, + "grad_norm": 0.28081074357032776, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6762, + "grad_norm": 0.38274309039115906, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7075, + "grad_norm": 0.4382462203502655, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6993, + "grad_norm": 0.14974206686019897, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6903, + "grad_norm": 0.0787801742553711, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.697, + "grad_norm": 0.3469065725803375, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6956, + "grad_norm": 0.07199260592460632, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6922, + "grad_norm": 0.10803790390491486, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6919, + "grad_norm": 0.06186731904745102, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6943, + "grad_norm": 0.26043465733528137, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6959, + "grad_norm": 0.1433803290128708, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.691, + "grad_norm": 0.29147329926490784, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6961, + "grad_norm": 0.2083701491355896, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6907, + "grad_norm": 0.09508762508630753, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6972, + "grad_norm": 0.12146692723035812, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.694, + "grad_norm": 0.0790642723441124, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6953, + "grad_norm": 0.08034928143024445, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6926, + "grad_norm": 0.1092766672372818, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6915, + "grad_norm": 0.10284140706062317, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6911, + "grad_norm": 0.10155194997787476, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6861, + "grad_norm": 0.10975462943315506, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6854, + "grad_norm": 0.11360020935535431, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6991, + "grad_norm": 0.22840693593025208, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.707, + "grad_norm": 0.2785073518753052, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6952, + "grad_norm": 0.07849278301000595, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7008, + "grad_norm": 0.2592863440513611, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6871, + "grad_norm": 0.13755610585212708, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.7005, + "grad_norm": 0.17429444193840027, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.7015, + "grad_norm": 0.3459007441997528, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6893, + "grad_norm": 0.2515803873538971, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.695, + "grad_norm": 0.1882389932870865, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6868, + "grad_norm": 0.21153484284877777, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6916, + "grad_norm": 0.09220373630523682, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6892, + "grad_norm": 0.30619680881500244, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6911, + "grad_norm": 0.15139108896255493, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6875, + "grad_norm": 0.3610985279083252, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6939, + "grad_norm": 0.12308734655380249, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6963, + "grad_norm": 0.29118868708610535, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6957, + "grad_norm": 0.37380826473236084, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6941, + "grad_norm": 0.10158409178256989, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6964, + "grad_norm": 0.14179424941539764, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6944, + "grad_norm": 0.05988779291510582, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6923, + "grad_norm": 0.17060285806655884, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6911, + "grad_norm": 0.22303560376167297, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6951, + "grad_norm": 0.06213363632559776, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6958, + "grad_norm": 0.2764429450035095, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6925, + "grad_norm": 0.06017112359404564, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6945, + "grad_norm": 0.07642989605665207, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6901, + "grad_norm": 0.21582236886024475, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6958, + "grad_norm": 0.22523757815361023, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6975, + "grad_norm": 0.10873033106327057, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6935, + "grad_norm": 0.07670394331216812, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6934, + "grad_norm": 0.06926532089710236, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.691, + "grad_norm": 0.12880925834178925, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.694, + "grad_norm": 0.23028628528118134, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6928, + "grad_norm": 0.210487499833107, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6928, + "grad_norm": 0.21001724898815155, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6924, + "grad_norm": 0.4239060878753662, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6887, + "grad_norm": 0.5119627714157104, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.696, + "grad_norm": 0.055483654141426086, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6909, + "grad_norm": 0.22218897938728333, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6938, + "grad_norm": 0.071445994079113, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6895, + "grad_norm": 0.18263757228851318, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6929, + "grad_norm": 0.06803449243307114, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6887, + "grad_norm": 0.07582808285951614, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6976, + "grad_norm": 0.2955162525177002, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6905, + "grad_norm": 0.09258957952260971, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6939, + "grad_norm": 0.1213991567492485, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6908, + "grad_norm": 0.11444179713726044, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6949, + "grad_norm": 0.29306772351264954, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6882, + "grad_norm": 0.23926085233688354, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456045056, + "loss": 0.6995, + "grad_norm": 0.33981063961982727, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7008, + "grad_norm": 0.34058380126953125, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6932, + "grad_norm": 0.14790907502174377, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6964, + "grad_norm": 0.11046469211578369, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6986, + "grad_norm": 0.10569898039102554, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.694, + "grad_norm": 0.14173872768878937, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6907, + "grad_norm": 0.13812153041362762, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6922, + "grad_norm": 0.17387312650680542, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6926, + "grad_norm": 0.1000937968492508, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6914, + "grad_norm": 0.3135361671447754, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6923, + "grad_norm": 0.06628899276256561, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.694, + "grad_norm": 0.16795030236244202, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6929, + "grad_norm": 0.16226153075695038, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6884, + "grad_norm": 0.06549161672592163, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6993, + "grad_norm": 0.07886295765638351, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6923, + "grad_norm": 0.05662911385297775, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6929, + "grad_norm": 0.19984129071235657, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6918, + "grad_norm": 0.11157559603452682, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6937, + "grad_norm": 0.1134122982621193, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456041984, + "loss": 0.6915, + "grad_norm": 0.15191589295864105, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6926, + "grad_norm": 0.1569242626428604, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6934, + "grad_norm": 0.1049153208732605, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6934, + "grad_norm": 0.13054990768432617, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6896, + "grad_norm": 0.28669801354408264, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.692, + "grad_norm": 0.5552548766136169, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6935, + "grad_norm": 0.09054745733737946, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6932, + "grad_norm": 0.2277979999780655, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6954, + "grad_norm": 0.1846141368150711, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.693, + "grad_norm": 0.17892596125602722, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6939, + "grad_norm": 0.15679946541786194, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6926, + "grad_norm": 0.09792538732290268, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.7024, + "grad_norm": 0.34966909885406494, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6944, + "grad_norm": 0.09896015375852585, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6922, + "grad_norm": 0.13487832248210907, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6929, + "grad_norm": 0.0782841220498085, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6969, + "grad_norm": 0.3944440484046936, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.697, + "grad_norm": 0.23645463585853577, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6927, + "grad_norm": 0.16764096915721893, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6967, + "grad_norm": 0.2795197665691376, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6915, + "grad_norm": 0.08546560257673264, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.694, + "grad_norm": 0.10810653120279312, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6896, + "grad_norm": 0.2774757444858551, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6928, + "grad_norm": 0.1478627324104309, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6904, + "grad_norm": 0.17939648032188416, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6908, + "grad_norm": 0.06489428877830505, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6981, + "grad_norm": 0.18909065425395966, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6912, + "grad_norm": 0.07397881150245667, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6938, + "grad_norm": 0.09233646094799042, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.693, + "grad_norm": 0.0686846524477005, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6874, + "grad_norm": 0.30937594175338745, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6904, + "grad_norm": 0.12163514643907547, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6919, + "grad_norm": 0.08242300152778625, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6931, + "grad_norm": 0.07798877358436584, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6889, + "grad_norm": 0.06930743902921677, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6922, + "grad_norm": 0.12818168103694916, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6975, + "grad_norm": 0.3352430760860443, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456040448, + "loss": 0.6877, + "grad_norm": 0.10930031538009644, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6858, + "grad_norm": 0.27856409549713135, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6945, + "grad_norm": 0.19082336127758026, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6992, + "grad_norm": 0.2985897362232208, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6948, + "grad_norm": 0.42367178201675415, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6959, + "grad_norm": 0.2314881980419159, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6928, + "grad_norm": 0.18180237710475922, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.7011, + "grad_norm": 0.43466371297836304, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456038912, + "loss": 0.6915, + "grad_norm": 0.06398872286081314, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6898, + "grad_norm": 0.22946597635746002, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6943, + "grad_norm": 0.09485232084989548, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6939, + "grad_norm": 0.23554526269435883, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6916, + "grad_norm": 0.13757729530334473, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6949, + "grad_norm": 0.13301430642604828, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6925, + "grad_norm": 0.08851190656423569, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6903, + "grad_norm": 0.12842616438865662, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456037376, + "loss": 0.6949, + "grad_norm": 0.21782749891281128, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6948, + "grad_norm": 0.1000092476606369, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456026624, + "loss": 0.6947, + "grad_norm": 0.3556864261627197, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6914, + "grad_norm": 0.1738535612821579, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6916, + "grad_norm": 0.13462737202644348, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6904, + "grad_norm": 0.19031384587287903, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6943, + "grad_norm": 0.30036935210227966, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6946, + "grad_norm": 0.10329035669565201, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6913, + "grad_norm": 0.18808455765247345, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6929, + "grad_norm": 0.147823765873909, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6962, + "grad_norm": 0.09249557554721832, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6904, + "grad_norm": 0.17523115873336792, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.693, + "grad_norm": 0.18554629385471344, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.692, + "grad_norm": 0.06263169646263123, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.692, + "grad_norm": 0.17601223289966583, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6893, + "grad_norm": 0.06357831507921219, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6923, + "grad_norm": 0.06812720745801926, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6907, + "grad_norm": 0.2193717658519745, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6964, + "grad_norm": 0.31179434061050415, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6982, + "grad_norm": 0.13962934911251068, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6893, + "grad_norm": 0.24792057275772095, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.695, + "grad_norm": 0.13216660916805267, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6915, + "grad_norm": 0.09431707859039307, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6936, + "grad_norm": 0.06636515259742737, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6906, + "grad_norm": 0.13196074962615967, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6926, + "grad_norm": 0.1271630823612213, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456029696, + "loss": 0.6899, + "grad_norm": 0.19110821187496185, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45602816, + "loss": 0.6914, + "grad_norm": 0.21459637582302094, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456041984, + "loss": 0.6947, + "grad_norm": 0.18273432552814484, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6891, + "grad_norm": 0.09414903074502945, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6915, + "grad_norm": 0.0658152848482132, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456032768, + "loss": 0.6942, + "grad_norm": 0.2177802473306656, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6937, + "grad_norm": 0.06488247215747833, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456025088, + "loss": 0.6921, + "grad_norm": 0.09320878982543945, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.703727104, + "gpu_mem": 4.45603584, + "loss": 0.6965, + "grad_norm": 0.35684335231781006, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6948, + "grad_norm": 0.19065923988819122, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456034304, + "loss": 0.6924, + "grad_norm": 0.1915898323059082, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "loss": 0.6929, + "grad_norm": 0.1613631695508957, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.703727104, + "gpu_mem": 4.456031232, + "train_runtime": 1375.3898, + "train_samples_per_second": 29.763, + "train_steps_per_second": 0.465, + "total_flos": 0.0, + "train_loss": 0.740024858340621 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4758d99093e963e7b960b3e04b3ff68f0cc5fe --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 4, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 2, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..384e1397287d4d15eb6360ac8a4a4171526b3b6b --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.2440273037542662 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..0468a7e85f4fb56b238e8e15da8f5508008e8ced --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3153920 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-arc_c-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2", + "seed": 42, + "timestamp": "2025-09-13T01:52:21.412022" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..baf635ffbdf923b5a48518c8f0980f91bc686419 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r2-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.843392512, + "gpu_mem": 4.430079488, + "loss": 4.4614, + "grad_norm": 3.8946611881256104, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.848897536, + "gpu_mem": 4.45530112, + "loss": 4.6994, + "grad_norm": 4.02003812789917, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.848897536, + "gpu_mem": 4.45533184, + "loss": 4.3594, + "grad_norm": 4.214444637298584, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.849094144, + "gpu_mem": 4.455298048, + "loss": 4.126, + "grad_norm": 4.275318145751953, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.849094144, + "gpu_mem": 4.45528576, + "loss": 4.0209, + "grad_norm": 4.282114028930664, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455348736, + "loss": 4.0825, + "grad_norm": 3.9877567291259766, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.849290752, + "gpu_mem": 4.45535488, + "loss": 3.8303, + "grad_norm": 4.352617263793945, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455313408, + "loss": 3.6244, + "grad_norm": 3.6530308723449707, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.849290752, + "gpu_mem": 4.4553088, + "loss": 3.3334, + "grad_norm": 3.468229055404663, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455298048, + "loss": 3.0456, + "grad_norm": 4.338120937347412, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.849290752, + "gpu_mem": 4.4553088, + "loss": 2.7112, + "grad_norm": 3.928178310394287, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455333376, + "loss": 2.4481, + "grad_norm": 2.8286490440368652, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455333376, + "loss": 2.6106, + "grad_norm": 2.432260036468506, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455281152, + "loss": 2.1997, + "grad_norm": 1.7232836484909058, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455356416, + "loss": 1.8828, + "grad_norm": 1.2205018997192383, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455350272, + "loss": 2.0196, + "grad_norm": 1.1594927310943604, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.45535488, + "loss": 1.7567, + "grad_norm": 0.8488183617591858, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467927552, + "loss": 2.8824, + "grad_norm": 1.782801628112793, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467926016, + "loss": 1.857, + "grad_norm": 0.9957961440086365, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.849290752, + "gpu_mem": 4.46790144, + "loss": 1.6556, + "grad_norm": 0.715120255947113, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.849290752, + "gpu_mem": 4.46790912, + "loss": 1.5634, + "grad_norm": 0.40482035279273987, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467938304, + "loss": 1.871, + "grad_norm": 1.25314462184906, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467967488, + "loss": 1.5028, + "grad_norm": 0.34803447127342224, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467910656, + "loss": 1.5608, + "grad_norm": 0.3179437816143036, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467979776, + "loss": 1.4631, + "grad_norm": 0.4684447646141052, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467936768, + "loss": 1.5067, + "grad_norm": 0.5099743008613586, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467895296, + "loss": 1.4576, + "grad_norm": 0.25448524951934814, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467941376, + "loss": 1.6888, + "grad_norm": 0.5487000346183777, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467936768, + "loss": 1.4526, + "grad_norm": 0.22202962636947632, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467926016, + "loss": 1.5256, + "grad_norm": 0.42495542764663696, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467956736, + "loss": 1.4789, + "grad_norm": 0.39444655179977417, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467965952, + "loss": 1.4731, + "grad_norm": 0.2375553995370865, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467945984, + "loss": 1.4928, + "grad_norm": 0.242265522480011, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.849290752, + "gpu_mem": 4.46792448, + "loss": 1.4607, + "grad_norm": 0.25905004143714905, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467812352, + "loss": 2.1667, + "grad_norm": 0.2108139991760254, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455327232, + "loss": 1.4074, + "grad_norm": 0.2236945927143097, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455336448, + "loss": 1.4506, + "grad_norm": 0.2395268827676773, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455307264, + "loss": 1.427, + "grad_norm": 0.27511242032051086, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455325696, + "loss": 1.4053, + "grad_norm": 0.20745790004730225, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455302656, + "loss": 1.448, + "grad_norm": 0.22691254317760468, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455304192, + "loss": 1.4359, + "grad_norm": 0.22095000743865967, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455333376, + "loss": 1.3688, + "grad_norm": 0.3039399981498718, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455348736, + "loss": 1.4336, + "grad_norm": 0.31906071305274963, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455367168, + "loss": 1.4135, + "grad_norm": 0.1812860369682312, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455321088, + "loss": 1.3819, + "grad_norm": 0.1355254203081131, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455314944, + "loss": 1.3781, + "grad_norm": 0.2931221127510071, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.849290752, + "gpu_mem": 4.4553088, + "loss": 1.3965, + "grad_norm": 0.26195481419563293, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455313408, + "loss": 1.3711, + "grad_norm": 0.22411486506462097, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455304192, + "loss": 1.3992, + "grad_norm": 0.25941604375839233, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.45528576, + "loss": 1.3874, + "grad_norm": 0.18215373158454895, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455310336, + "loss": 1.3886, + "grad_norm": 0.21252787113189697, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.849290752, + "gpu_mem": 4.455337984, + "loss": 1.4109, + "grad_norm": 0.1708744317293167, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467922944, + "loss": 2.0759, + "grad_norm": 0.36979636549949646, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467892224, + "loss": 1.4061, + "grad_norm": 0.1473303586244583, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467926016, + "loss": 1.4266, + "grad_norm": 0.22449180483818054, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467999744, + "loss": 1.3953, + "grad_norm": 0.20237641036510468, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467942912, + "loss": 1.3861, + "grad_norm": 0.13346266746520996, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467936768, + "loss": 1.3533, + "grad_norm": 0.27854371070861816, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467987456, + "loss": 1.3928, + "grad_norm": 0.20833967626094818, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467913728, + "loss": 1.4186, + "grad_norm": 0.21329668164253235, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467927552, + "loss": 1.4168, + "grad_norm": 0.2251892238855362, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467929088, + "loss": 1.3743, + "grad_norm": 0.20018987357616425, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467918336, + "loss": 1.3656, + "grad_norm": 0.15898799896240234, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467935232, + "loss": 1.3608, + "grad_norm": 0.21927297115325928, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467956736, + "loss": 1.3956, + "grad_norm": 0.23750653862953186, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.849290752, + "gpu_mem": 4.46794752, + "loss": 1.3818, + "grad_norm": 0.2604621350765228, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.849290752, + "gpu_mem": 4.467973632, + "loss": 1.3726, + "grad_norm": 0.19689707458019257, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.46792448, + "loss": 1.3674, + "grad_norm": 0.3441721796989441, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.849290752, + "gpu_mem": 4.46792448, + "train_runtime": 764.217, + "train_samples_per_second": 5.857, + "train_steps_per_second": 0.089, + "total_flos": 4007637443321856.0, + "train_loss": 1.9421252590768479 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef1d724eca7640a4f365c193cda2fc4efdb2073 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..dd66dd3e24454623529b403c2bddb468d634989d --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.2773037542662116 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..6b84bfe39aa4ca092dec34cbeee1b5ff1610a133 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 50462720 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-arc_c-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2", + "seed": 42, + "timestamp": "2025-09-14T05:43:02.345333" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..cfcc72789f1b41b94a6ee168fb5b1e9d43bb0bec --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r32-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.862864896, + "gpu_mem": 4.619314688, + "loss": 4.4614, + "grad_norm": 3.8523473739624023, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.86836992, + "gpu_mem": 5.02300672, + "loss": 4.6994, + "grad_norm": 3.918381452560425, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.86836992, + "gpu_mem": 5.02303744, + "loss": 4.2307, + "grad_norm": 4.096261978149414, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023003648, + "loss": 3.7407, + "grad_norm": 4.00218391418457, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.86836992, + "gpu_mem": 5.02299136, + "loss": 3.2783, + "grad_norm": 3.4006645679473877, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023054336, + "loss": 3.129, + "grad_norm": 4.129428863525391, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.86836992, + "gpu_mem": 5.02306048, + "loss": 2.4964, + "grad_norm": 2.6763901710510254, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023019008, + "loss": 2.2114, + "grad_norm": 1.290360450744629, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.86836992, + "gpu_mem": 5.0230144, + "loss": 2.0118, + "grad_norm": 0.8746750950813293, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023003648, + "loss": 1.7145, + "grad_norm": 0.46772903203964233, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.86836992, + "gpu_mem": 5.0230144, + "loss": 1.5075, + "grad_norm": 0.2590790092945099, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023038976, + "loss": 1.4578, + "grad_norm": 0.24011576175689697, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023038976, + "loss": 1.5113, + "grad_norm": 0.3649512827396393, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.86836992, + "gpu_mem": 5.022986752, + "loss": 1.5277, + "grad_norm": 0.215444415807724, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023062016, + "loss": 1.6125, + "grad_norm": 0.3915955424308777, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.86836992, + "gpu_mem": 5.023055872, + "loss": 1.4828, + "grad_norm": 0.20246171951293945, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.86836992, + "gpu_mem": 5.02306048, + "loss": 1.3977, + "grad_norm": 0.14407864212989807, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224868352, + "loss": 2.0465, + "grad_norm": 0.25867244601249695, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224866816, + "loss": 1.4179, + "grad_norm": 0.17259110510349274, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.86836992, + "gpu_mem": 5.22484224, + "loss": 1.3521, + "grad_norm": 0.19877421855926514, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.86836992, + "gpu_mem": 5.22484992, + "loss": 1.3588, + "grad_norm": 0.13644251227378845, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224879104, + "loss": 1.3667, + "grad_norm": 0.14385129511356354, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224908288, + "loss": 1.3291, + "grad_norm": 0.1350773274898529, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224851456, + "loss": 1.3715, + "grad_norm": 0.13441303372383118, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224920576, + "loss": 1.3148, + "grad_norm": 0.17031680047512054, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224877568, + "loss": 1.337, + "grad_norm": 0.119869664311409, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224836096, + "loss": 1.35, + "grad_norm": 0.1316402554512024, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224882176, + "loss": 1.495, + "grad_norm": 0.399570494890213, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224877568, + "loss": 1.396, + "grad_norm": 0.17705513536930084, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224866816, + "loss": 1.4093, + "grad_norm": 0.21322089433670044, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224897536, + "loss": 1.3744, + "grad_norm": 0.17508447170257568, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224906752, + "loss": 1.3874, + "grad_norm": 0.12007424235343933, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224886784, + "loss": 1.3909, + "grad_norm": 0.10286141186952591, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.86836992, + "gpu_mem": 5.22486528, + "loss": 1.3752, + "grad_norm": 0.11452487111091614, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.86836992, + "gpu_mem": 5.224753152, + "loss": 2.1375, + "grad_norm": 0.2822839319705963, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023032832, + "loss": 1.4265, + "grad_norm": 0.2875940501689911, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023042048, + "loss": 1.3726, + "grad_norm": 0.10795528441667557, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023012864, + "loss": 1.3486, + "grad_norm": 0.1297284960746765, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023031296, + "loss": 1.3483, + "grad_norm": 0.08380130678415298, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023008256, + "loss": 1.404, + "grad_norm": 0.1665874570608139, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023009792, + "loss": 1.4074, + "grad_norm": 0.15654538571834564, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023038976, + "loss": 1.316, + "grad_norm": 0.13470788300037384, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023054336, + "loss": 1.3817, + "grad_norm": 0.2074926495552063, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023072768, + "loss": 1.3621, + "grad_norm": 0.09369979798793793, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023026688, + "loss": 1.3338, + "grad_norm": 0.08683168143033981, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023020544, + "loss": 1.3136, + "grad_norm": 0.13517531752586365, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.868566528, + "gpu_mem": 5.0230144, + "loss": 1.3176, + "grad_norm": 0.13062775135040283, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023019008, + "loss": 1.3, + "grad_norm": 0.10755203664302826, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023009792, + "loss": 1.3443, + "grad_norm": 0.12639285624027252, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.868566528, + "gpu_mem": 5.02299136, + "loss": 1.3375, + "grad_norm": 0.12154214084148407, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023015936, + "loss": 1.376, + "grad_norm": 0.19331255555152893, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.868566528, + "gpu_mem": 5.023043584, + "loss": 1.3656, + "grad_norm": 0.11791692674160004, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224863744, + "loss": 2.0, + "grad_norm": 0.22311677038669586, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224833024, + "loss": 1.3845, + "grad_norm": 0.17593929171562195, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224866816, + "loss": 1.3976, + "grad_norm": 0.21280762553215027, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224940544, + "loss": 1.3612, + "grad_norm": 0.1263953298330307, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224883712, + "loss": 1.3569, + "grad_norm": 0.10715311765670776, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224877568, + "loss": 1.2637, + "grad_norm": 0.116644948720932, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224928256, + "loss": 1.343, + "grad_norm": 0.15131889283657074, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224854528, + "loss": 1.4025, + "grad_norm": 0.19640026986598969, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224868352, + "loss": 1.4148, + "grad_norm": 0.1944180577993393, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224869888, + "loss": 1.3417, + "grad_norm": 0.1324373483657837, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224859136, + "loss": 1.3408, + "grad_norm": 0.13134542107582092, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224876032, + "loss": 1.3258, + "grad_norm": 0.13349179923534393, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224897536, + "loss": 1.3424, + "grad_norm": 0.1222631111741066, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.868566528, + "gpu_mem": 5.22488832, + "loss": 1.3032, + "grad_norm": 0.11887747049331665, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.868566528, + "gpu_mem": 5.224914432, + "loss": 1.3324, + "grad_norm": 0.11385256797075272, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.868566528, + "gpu_mem": 5.22486528, + "loss": 1.3143, + "grad_norm": 0.1465492844581604, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.868566528, + "gpu_mem": 5.22486528, + "train_runtime": 765.9614, + "train_samples_per_second": 5.844, + "train_steps_per_second": 0.089, + "total_flos": 4190351787565056.0, + "train_loss": 1.6758992479127996 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5723daa9f5f7b854bf548bbee9a6d37e12198a3a --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 16, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 8, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f6c4753fe0e5b92da858d0472a8ff7f864c4ef07 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.26023890784982934 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..16aee8ac37815dd55814912c12647142bb65fd98 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12615680 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-arc_c-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2", + "seed": 42, + "timestamp": "2025-09-13T15:40:46.212367" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..9fcd60052b4342c9cc93c395fbf2fe9aaeb33167 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_c-r8-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.843372032, + "gpu_mem": 4.467926528, + "loss": 4.4614, + "grad_norm": 3.7767608165740967, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.848877056, + "gpu_mem": 4.56884224, + "loss": 4.6994, + "grad_norm": 3.902996301651001, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.849073664, + "gpu_mem": 4.56887296, + "loss": 4.3151, + "grad_norm": 4.104316234588623, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568839168, + "loss": 3.9919, + "grad_norm": 4.12067985534668, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.849073664, + "gpu_mem": 4.56882688, + "loss": 3.751, + "grad_norm": 3.9066431522369385, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568889856, + "loss": 3.6737, + "grad_norm": 3.354490280151367, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568896, + "loss": 3.2016, + "grad_norm": 3.078850507736206, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568854528, + "loss": 2.9939, + "grad_norm": 4.0653815269470215, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.849073664, + "gpu_mem": 4.56884992, + "loss": 2.6281, + "grad_norm": 2.3823447227478027, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568839168, + "loss": 2.245, + "grad_norm": 1.4851523637771606, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.849073664, + "gpu_mem": 4.56884992, + "loss": 1.8774, + "grad_norm": 0.9828141927719116, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568874496, + "loss": 1.7402, + "grad_norm": 0.7155304551124573, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568874496, + "loss": 1.8955, + "grad_norm": 0.8261786103248596, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568822272, + "loss": 1.6686, + "grad_norm": 0.3713507354259491, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568897536, + "loss": 1.5893, + "grad_norm": 0.3085663616657257, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568891392, + "loss": 1.6288, + "grad_norm": 0.2903585731983185, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.849073664, + "gpu_mem": 4.568896, + "loss": 1.4667, + "grad_norm": 0.20423276722431183, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619315712, + "loss": 2.3181, + "grad_norm": 0.7196502089500427, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619314176, + "loss": 1.5486, + "grad_norm": 0.36073899269104004, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.849270272, + "gpu_mem": 4.6192896, + "loss": 1.3929, + "grad_norm": 0.26487046480178833, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.849270272, + "gpu_mem": 4.61929728, + "loss": 1.4255, + "grad_norm": 0.18397259712219238, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619326464, + "loss": 1.5301, + "grad_norm": 0.43461138010025024, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619355648, + "loss": 1.364, + "grad_norm": 0.16019728779792786, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619298816, + "loss": 1.4224, + "grad_norm": 0.18088769912719727, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619367936, + "loss": 1.3128, + "grad_norm": 0.1378791481256485, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619324928, + "loss": 1.3744, + "grad_norm": 0.1631397157907486, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619283456, + "loss": 1.4121, + "grad_norm": 0.2546769678592682, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619329536, + "loss": 1.541, + "grad_norm": 0.4572147727012634, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619324928, + "loss": 1.3923, + "grad_norm": 0.14714832603931427, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619314176, + "loss": 1.4077, + "grad_norm": 0.23051311075687408, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619344896, + "loss": 1.3878, + "grad_norm": 0.2572135925292969, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619354112, + "loss": 1.4099, + "grad_norm": 0.15407662093639374, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619334144, + "loss": 1.4096, + "grad_norm": 0.13206005096435547, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.849270272, + "gpu_mem": 4.61931264, + "loss": 1.3775, + "grad_norm": 0.15021972358226776, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619200512, + "loss": 2.1294, + "grad_norm": 0.30331751704216003, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568868352, + "loss": 1.4055, + "grad_norm": 0.2694096267223358, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568877568, + "loss": 1.4, + "grad_norm": 0.1496746987104416, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568848384, + "loss": 1.343, + "grad_norm": 0.12383216619491577, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568866816, + "loss": 1.3566, + "grad_norm": 0.12478478252887726, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568843776, + "loss": 1.417, + "grad_norm": 0.20593976974487305, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568845312, + "loss": 1.4032, + "grad_norm": 0.21525323390960693, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568874496, + "loss": 1.3515, + "grad_norm": 0.17963601648807526, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568889856, + "loss": 1.4054, + "grad_norm": 0.28198957443237305, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568908288, + "loss": 1.3873, + "grad_norm": 0.191033735871315, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568862208, + "loss": 1.3503, + "grad_norm": 0.12333618104457855, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568856064, + "loss": 1.3251, + "grad_norm": 0.3150593638420105, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.849270272, + "gpu_mem": 4.56884992, + "loss": 1.3654, + "grad_norm": 0.18101467192173004, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568854528, + "loss": 1.3232, + "grad_norm": 0.15281569957733154, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568845312, + "loss": 1.3664, + "grad_norm": 0.19651691615581512, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.56882688, + "loss": 1.3379, + "grad_norm": 0.10636218637228012, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568851456, + "loss": 1.3877, + "grad_norm": 0.22441740334033966, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.849270272, + "gpu_mem": 4.568879104, + "loss": 1.3925, + "grad_norm": 0.15090321004390717, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619311104, + "loss": 2.0274, + "grad_norm": 0.24913212656974792, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619280384, + "loss": 1.4008, + "grad_norm": 0.29139190912246704, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619314176, + "loss": 1.4013, + "grad_norm": 0.23508620262145996, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619387904, + "loss": 1.3567, + "grad_norm": 0.17042416334152222, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619331072, + "loss": 1.3658, + "grad_norm": 0.20380978286266327, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619324928, + "loss": 1.2986, + "grad_norm": 0.1716468334197998, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619375616, + "loss": 1.369, + "grad_norm": 0.21552585065364838, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619301888, + "loss": 1.4203, + "grad_norm": 0.323853999376297, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619315712, + "loss": 1.411, + "grad_norm": 0.30850380659103394, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619317248, + "loss": 1.3628, + "grad_norm": 0.1667039841413498, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619306496, + "loss": 1.3517, + "grad_norm": 0.18624359369277954, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619323392, + "loss": 1.3325, + "grad_norm": 0.18097564578056335, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619344896, + "loss": 1.3673, + "grad_norm": 0.156330868601799, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.849270272, + "gpu_mem": 4.61933568, + "loss": 1.317, + "grad_norm": 0.2276175171136856, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.849270272, + "gpu_mem": 4.619361792, + "loss": 1.3513, + "grad_norm": 0.13923251628875732, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.61931264, + "loss": 1.3345, + "grad_norm": 0.24741221964359283, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.849270272, + "gpu_mem": 4.61931264, + "train_runtime": 763.0811, + "train_samples_per_second": 5.866, + "train_steps_per_second": 0.089, + "total_flos": 4044180312170496.0, + "train_loss": 1.7760318692992716 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4758d99093e963e7b960b3e04b3ff68f0cc5fe --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 4, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 2, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..038a0c81cd15fb7be03405184feeec6b65142b25 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.24957912457912457 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f68d8a63e3c6cf339c20bda94e68ef185a11849b --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3153920 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-arc_e-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2", + "seed": 42, + "timestamp": "2025-09-13T00:30:37.302867" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..96b32457b2d8663c4697b88cdf5e9a49415551c6 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r2-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.8680832, + "gpu_mem": 4.430024192, + "loss": 4.6319, + "grad_norm": 4.065451622009277, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.873391616, + "gpu_mem": 4.45531648, + "loss": 4.4578, + "grad_norm": 4.036701202392578, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.873588224, + "gpu_mem": 4.455294976, + "loss": 4.6328, + "grad_norm": 4.180227756500244, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.873784832, + "gpu_mem": 4.455273472, + "loss": 4.7706, + "grad_norm": 3.8782899379730225, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.87398144, + "gpu_mem": 4.455314944, + "loss": 4.529, + "grad_norm": 4.0265045166015625, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.87398144, + "gpu_mem": 4.455290368, + "loss": 4.2479, + "grad_norm": 4.177544116973877, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455313408, + "loss": 4.2672, + "grad_norm": 4.227900505065918, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455271936, + "loss": 4.1248, + "grad_norm": 4.28433895111084, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455273472, + "loss": 4.2011, + "grad_norm": 4.030056953430176, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455268864, + "loss": 3.7578, + "grad_norm": 3.9933383464813232, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.874178048, + "gpu_mem": 4.4553472, + "loss": 3.6776, + "grad_norm": 3.85056471824646, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455321088, + "loss": 3.4678, + "grad_norm": 3.6991894245147705, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455271936, + "loss": 3.4373, + "grad_norm": 4.062644004821777, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.874178048, + "gpu_mem": 4.45529344, + "loss": 3.1714, + "grad_norm": 3.8472795486450195, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.874178048, + "gpu_mem": 4.4552704, + "loss": 2.9463, + "grad_norm": 2.9479715824127197, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455275008, + "loss": 2.644, + "grad_norm": 2.442200183868408, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455311872, + "loss": 2.5557, + "grad_norm": 2.285137891769409, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455322624, + "loss": 2.4043, + "grad_norm": 1.9421344995498657, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455265792, + "loss": 2.502, + "grad_norm": 1.947041630744934, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455336448, + "loss": 1.7626, + "grad_norm": 0.9063177108764648, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455334912, + "loss": 2.0804, + "grad_norm": 1.5791268348693848, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.874178048, + "gpu_mem": 4.455291904, + "loss": 2.043, + "grad_norm": 1.404981017112732, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4553088, + "loss": 1.8419, + "grad_norm": 1.1069085597991943, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455265792, + "loss": 1.7489, + "grad_norm": 0.7893968224525452, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455294976, + "loss": 1.6522, + "grad_norm": 0.5530843138694763, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455275008, + "loss": 1.553, + "grad_norm": 0.2656780183315277, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.874374656, + "gpu_mem": 4.45530112, + "loss": 1.5266, + "grad_norm": 0.42043960094451904, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.874374656, + "gpu_mem": 4.45530112, + "loss": 1.5525, + "grad_norm": 0.35208380222320557, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455279616, + "loss": 1.5117, + "grad_norm": 0.6222015619277954, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4552704, + "loss": 1.4743, + "grad_norm": 0.3267349898815155, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455288832, + "loss": 1.4276, + "grad_norm": 0.25030508637428284, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455311872, + "loss": 1.4375, + "grad_norm": 0.3375011682510376, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4553088, + "loss": 1.4379, + "grad_norm": 0.23925703763961792, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455311872, + "loss": 1.4815, + "grad_norm": 0.3022330701351166, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.874374656, + "gpu_mem": 4.45529344, + "loss": 1.4094, + "grad_norm": 0.4639003276824951, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467899904, + "loss": 2.172, + "grad_norm": 1.018775463104248, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467904512, + "loss": 1.4021, + "grad_norm": 0.24485285580158234, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467883008, + "loss": 1.3512, + "grad_norm": 0.3681104779243469, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467872256, + "loss": 1.3377, + "grad_norm": 0.28259164094924927, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467935232, + "loss": 1.387, + "grad_norm": 0.21912617981433868, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467895296, + "loss": 1.3273, + "grad_norm": 0.1466824859380722, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467938304, + "loss": 1.3807, + "grad_norm": 0.2532905042171478, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467887616, + "loss": 1.4233, + "grad_norm": 0.2067308872938156, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467952128, + "loss": 1.3554, + "grad_norm": 0.13997229933738708, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467919872, + "loss": 1.3601, + "grad_norm": 0.1438092440366745, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46792448, + "loss": 1.3792, + "grad_norm": 0.1838187277317047, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46787072, + "loss": 1.356, + "grad_norm": 0.3598126173019409, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467884544, + "loss": 1.3507, + "grad_norm": 0.19435901939868927, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467873792, + "loss": 1.3441, + "grad_norm": 0.18648944795131683, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467887616, + "loss": 1.3496, + "grad_norm": 0.223169207572937, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46793984, + "loss": 1.3736, + "grad_norm": 0.2813203036785126, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467887616, + "loss": 1.3585, + "grad_norm": 0.19064456224441528, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467956736, + "loss": 1.3085, + "grad_norm": 0.19727540016174316, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46792448, + "loss": 1.3603, + "grad_norm": 0.35384392738342285, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467933696, + "loss": 1.3636, + "grad_norm": 0.1487715095281601, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46790912, + "loss": 1.2954, + "grad_norm": 0.20130091905593872, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467942912, + "loss": 1.3362, + "grad_norm": 0.22836185991764069, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46792448, + "loss": 1.3297, + "grad_norm": 0.24739979207515717, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467910656, + "loss": 1.3361, + "grad_norm": 0.16636288166046143, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467949056, + "loss": 1.34, + "grad_norm": 0.2644355595111847, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467881472, + "loss": 1.3237, + "grad_norm": 0.3212207555770874, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467929088, + "loss": 1.3672, + "grad_norm": 0.18150964379310608, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4678784, + "loss": 1.3397, + "grad_norm": 0.2501877248287201, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467927552, + "loss": 1.3303, + "grad_norm": 0.2916346490383148, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467926016, + "loss": 1.3667, + "grad_norm": 0.17438240349292755, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467944448, + "loss": 1.3228, + "grad_norm": 0.24722999334335327, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46788608, + "loss": 1.333, + "grad_norm": 0.23443934321403503, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467898368, + "loss": 1.3757, + "grad_norm": 0.15439674258232117, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467922944, + "loss": 1.3154, + "grad_norm": 0.24316363036632538, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467899904, + "loss": 1.2988, + "grad_norm": 0.19323624670505524, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467735552, + "loss": 2.0051, + "grad_norm": 0.8141029477119446, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455304192, + "loss": 1.3503, + "grad_norm": 0.2051018625497818, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455267328, + "loss": 1.3195, + "grad_norm": 0.17024345695972443, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455327232, + "loss": 1.3423, + "grad_norm": 0.20512598752975464, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455294976, + "loss": 1.3429, + "grad_norm": 0.3033762574195862, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455305728, + "loss": 1.306, + "grad_norm": 0.1772235631942749, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455342592, + "loss": 1.3688, + "grad_norm": 0.18338434398174286, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455327232, + "loss": 1.367, + "grad_norm": 0.2746819853782654, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.874374656, + "gpu_mem": 4.45527808, + "loss": 1.2867, + "grad_norm": 0.20020714402198792, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455322624, + "loss": 1.3703, + "grad_norm": 0.20599445700645447, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4553088, + "loss": 1.3679, + "grad_norm": 0.3158780634403229, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455276544, + "loss": 1.3743, + "grad_norm": 0.33274611830711365, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455327232, + "loss": 1.3847, + "grad_norm": 0.4091032147407532, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455265792, + "loss": 1.3523, + "grad_norm": 0.1747487187385559, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455311872, + "loss": 1.3773, + "grad_norm": 0.2982265055179596, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455265792, + "loss": 1.3526, + "grad_norm": 0.2403230369091034, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455296512, + "loss": 1.3839, + "grad_norm": 0.15933208167552948, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455271936, + "loss": 1.324, + "grad_norm": 0.24802301824092865, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455325696, + "loss": 1.3021, + "grad_norm": 0.19023920595645905, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455307264, + "loss": 1.342, + "grad_norm": 0.20103256404399872, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455256576, + "loss": 1.3852, + "grad_norm": 0.3424113392829895, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455281152, + "loss": 1.3511, + "grad_norm": 0.1586439609527588, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455284224, + "loss": 1.2809, + "grad_norm": 0.28515490889549255, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455276544, + "loss": 1.3112, + "grad_norm": 0.20693497359752655, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455314944, + "loss": 1.3254, + "grad_norm": 0.1791871041059494, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.874374656, + "gpu_mem": 4.45532416, + "loss": 1.2997, + "grad_norm": 0.35642847418785095, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455267328, + "loss": 1.3717, + "grad_norm": 0.2160898596048355, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455267328, + "loss": 1.3428, + "grad_norm": 0.12680499255657196, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455264256, + "loss": 1.3135, + "grad_norm": 0.2591705620288849, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.874374656, + "gpu_mem": 4.45526272, + "loss": 1.2762, + "grad_norm": 0.1725183129310608, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455305728, + "loss": 1.2709, + "grad_norm": 0.16160669922828674, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455244288, + "loss": 1.2992, + "grad_norm": 0.1716378629207611, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.874374656, + "gpu_mem": 4.45529344, + "loss": 1.3175, + "grad_norm": 0.13474643230438232, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455356416, + "loss": 1.3214, + "grad_norm": 0.20464323461055756, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4553088, + "loss": 1.2857, + "grad_norm": 0.15217600762844086, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.874374656, + "gpu_mem": 4.455290368, + "loss": 1.348, + "grad_norm": 0.13626451790332794, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467922944, + "loss": 1.8961, + "grad_norm": 0.46198323369026184, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467904512, + "loss": 1.3236, + "grad_norm": 0.16889333724975586, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46789376, + "loss": 1.3425, + "grad_norm": 0.17433060705661774, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46794752, + "loss": 1.348, + "grad_norm": 0.4165462851524353, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467907584, + "loss": 1.3454, + "grad_norm": 0.24124093353748322, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467926016, + "loss": 1.3342, + "grad_norm": 0.1414530724287033, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467988992, + "loss": 1.3212, + "grad_norm": 0.15928998589515686, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4679168, + "loss": 1.3403, + "grad_norm": 0.17498110234737396, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467910656, + "loss": 1.3511, + "grad_norm": 0.23636463284492493, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467926016, + "loss": 1.3359, + "grad_norm": 0.16740088164806366, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467941376, + "loss": 1.2967, + "grad_norm": 0.18808303773403168, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46793216, + "loss": 1.3097, + "grad_norm": 0.1812855303287506, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467922944, + "loss": 1.3512, + "grad_norm": 0.19479770958423615, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467941376, + "loss": 1.3183, + "grad_norm": 0.16387511789798737, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46793984, + "loss": 1.3159, + "grad_norm": 0.22514477372169495, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467896832, + "loss": 1.3252, + "grad_norm": 0.29075855016708374, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467929088, + "loss": 1.3051, + "grad_norm": 0.15960793197155, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467883008, + "loss": 1.3202, + "grad_norm": 0.15475459396839142, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467927552, + "loss": 1.3297, + "grad_norm": 0.3692609965801239, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4678784, + "loss": 1.319, + "grad_norm": 0.31852075457572937, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467890688, + "loss": 1.3256, + "grad_norm": 0.16433864831924438, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467915264, + "loss": 1.3001, + "grad_norm": 0.2812267541885376, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467876864, + "loss": 1.3201, + "grad_norm": 0.15767137706279755, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467879936, + "loss": 1.353, + "grad_norm": 0.16969774663448334, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467892224, + "loss": 1.3332, + "grad_norm": 0.20540140569210052, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467856896, + "loss": 1.3227, + "grad_norm": 0.16817808151245117, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467898368, + "loss": 1.2852, + "grad_norm": 0.23800396919250488, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467913728, + "loss": 1.2772, + "grad_norm": 0.22161710262298584, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.874374656, + "gpu_mem": 4.4678784, + "loss": 1.3486, + "grad_norm": 0.33473095297813416, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.874374656, + "gpu_mem": 4.46788608, + "loss": 1.3573, + "grad_norm": 0.15975971519947052, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467907584, + "loss": 1.3204, + "grad_norm": 0.2749437093734741, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467918336, + "loss": 1.3404, + "grad_norm": 0.23362818360328674, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467910656, + "loss": 1.3518, + "grad_norm": 0.16985180974006653, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467944448, + "loss": 1.3343, + "grad_norm": 0.16269202530384064, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.874374656, + "gpu_mem": 4.467944448, + "train_runtime": 1378.7469, + "train_samples_per_second": 6.531, + "train_steps_per_second": 0.102, + "total_flos": 7241065831231488.0, + "train_loss": 1.7059970940862383 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef1d724eca7640a4f365c193cda2fc4efdb2073 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3d7bdf863d1db246819c5334c39d548dc5128f53 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.2676767676767677 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..68e2cc4d1422eb7f4a5260601d1b3b7760c69d0b --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 50462720 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-arc_e-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2", + "seed": 42, + "timestamp": "2025-09-14T04:17:50.657052" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..50380c217287b0b0c044adfdecabbe06c8d80a78 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r32-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.880563712, + "gpu_mem": 4.619259392, + "loss": 4.6319, + "grad_norm": 3.9199652671813965, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.886068736, + "gpu_mem": 5.02302208, + "loss": 4.4578, + "grad_norm": 3.97998046875, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.886265344, + "gpu_mem": 5.023000576, + "loss": 4.5687, + "grad_norm": 4.045681476593018, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.886461952, + "gpu_mem": 5.022979072, + "loss": 4.5909, + "grad_norm": 3.8668904304504395, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.886461952, + "gpu_mem": 5.023020544, + "loss": 4.1647, + "grad_norm": 3.776479959487915, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.886461952, + "gpu_mem": 5.022995968, + "loss": 3.6312, + "grad_norm": 3.6861133575439453, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.886461952, + "gpu_mem": 5.023019008, + "loss": 3.4079, + "grad_norm": 3.1410071849823, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.88665856, + "gpu_mem": 5.022977536, + "loss": 3.0393, + "grad_norm": 3.7539222240448, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.88665856, + "gpu_mem": 5.022979072, + "loss": 3.0146, + "grad_norm": 3.9807534217834473, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.886855168, + "gpu_mem": 5.022974464, + "loss": 2.4237, + "grad_norm": 1.7236407995224, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.886855168, + "gpu_mem": 5.0230528, + "loss": 2.2201, + "grad_norm": 1.2199119329452515, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.886855168, + "gpu_mem": 5.023026688, + "loss": 2.0871, + "grad_norm": 0.8235573172569275, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.886855168, + "gpu_mem": 5.022977536, + "loss": 1.867, + "grad_norm": 0.7165443897247314, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.886855168, + "gpu_mem": 5.02299904, + "loss": 1.714, + "grad_norm": 0.6233973503112793, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.886855168, + "gpu_mem": 5.022976, + "loss": 1.7003, + "grad_norm": 0.5033014416694641, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.886855168, + "gpu_mem": 5.022980608, + "loss": 1.6011, + "grad_norm": 0.33908239006996155, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.886855168, + "gpu_mem": 5.023017472, + "loss": 1.4614, + "grad_norm": 0.26101478934288025, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.886855168, + "gpu_mem": 5.023028224, + "loss": 1.399, + "grad_norm": 0.21480445563793182, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.886855168, + "gpu_mem": 5.022971392, + "loss": 1.4858, + "grad_norm": 0.4659430980682373, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023042048, + "loss": 1.444, + "grad_norm": 0.33030256628990173, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023040512, + "loss": 1.3503, + "grad_norm": 0.3957853615283966, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022997504, + "loss": 1.3906, + "grad_norm": 0.37114882469177246, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.887051776, + "gpu_mem": 5.0230144, + "loss": 1.3335, + "grad_norm": 0.18101029098033905, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022971392, + "loss": 1.3631, + "grad_norm": 0.14284168183803558, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023000576, + "loss": 1.3707, + "grad_norm": 0.1273227334022522, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022980608, + "loss": 1.417, + "grad_norm": 0.09615727514028549, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.887051776, + "gpu_mem": 5.02300672, + "loss": 1.3676, + "grad_norm": 0.15267758071422577, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.887051776, + "gpu_mem": 5.02300672, + "loss": 1.3464, + "grad_norm": 0.12064103037118912, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022985216, + "loss": 1.3009, + "grad_norm": 0.20544445514678955, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022976, + "loss": 1.3861, + "grad_norm": 0.19890104234218597, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022994432, + "loss": 1.3414, + "grad_norm": 0.113870769739151, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023017472, + "loss": 1.3276, + "grad_norm": 0.13833604753017426, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.887051776, + "gpu_mem": 5.0230144, + "loss": 1.3505, + "grad_norm": 0.18103745579719543, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023017472, + "loss": 1.4277, + "grad_norm": 0.3061281442642212, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.887051776, + "gpu_mem": 5.02299904, + "loss": 1.3074, + "grad_norm": 0.19278664886951447, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224840704, + "loss": 1.916, + "grad_norm": 0.2998383939266205, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224845312, + "loss": 1.361, + "grad_norm": 0.1525452882051468, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224823808, + "loss": 1.2464, + "grad_norm": 0.13389617204666138, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224813056, + "loss": 1.3455, + "grad_norm": 0.3108973205089569, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224876032, + "loss": 1.3951, + "grad_norm": 0.2972719967365265, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224836096, + "loss": 1.3211, + "grad_norm": 0.12421967834234238, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224879104, + "loss": 1.3412, + "grad_norm": 0.1402796506881714, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224828416, + "loss": 1.3805, + "grad_norm": 0.10279949009418488, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224892928, + "loss": 1.3894, + "grad_norm": 0.20946818590164185, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224860672, + "loss": 1.3752, + "grad_norm": 0.19150123000144958, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22486528, + "loss": 1.3672, + "grad_norm": 0.12701866030693054, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22481152, + "loss": 1.3128, + "grad_norm": 0.18270061910152435, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224825344, + "loss": 1.3241, + "grad_norm": 0.11208175122737885, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224814592, + "loss": 1.3344, + "grad_norm": 0.1389838606119156, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224828416, + "loss": 1.3383, + "grad_norm": 0.15130719542503357, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22488064, + "loss": 1.3625, + "grad_norm": 0.15674443542957306, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224828416, + "loss": 1.3688, + "grad_norm": 0.16031569242477417, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224897536, + "loss": 1.3111, + "grad_norm": 0.15271522104740143, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22486528, + "loss": 1.2957, + "grad_norm": 0.11683334410190582, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224874496, + "loss": 1.3694, + "grad_norm": 0.14974774420261383, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22484992, + "loss": 1.3126, + "grad_norm": 0.10473300516605377, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224883712, + "loss": 1.323, + "grad_norm": 0.07168374210596085, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22486528, + "loss": 1.3176, + "grad_norm": 0.09714635461568832, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224851456, + "loss": 1.3164, + "grad_norm": 0.06779345870018005, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224889856, + "loss": 1.3057, + "grad_norm": 0.1408751904964447, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224822272, + "loss": 1.3035, + "grad_norm": 0.13847613334655762, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224869888, + "loss": 1.3527, + "grad_norm": 0.09067568182945251, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.887051776, + "gpu_mem": 5.2248192, + "loss": 1.3277, + "grad_norm": 0.1276838183403015, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224868352, + "loss": 1.333, + "grad_norm": 0.15827490389347076, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224866816, + "loss": 1.3487, + "grad_norm": 0.11841024458408356, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224885248, + "loss": 1.3174, + "grad_norm": 0.12149811536073685, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22482688, + "loss": 1.3116, + "grad_norm": 0.1002575233578682, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224839168, + "loss": 1.3516, + "grad_norm": 0.08308219164609909, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224863744, + "loss": 1.305, + "grad_norm": 0.12909609079360962, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224840704, + "loss": 1.2842, + "grad_norm": 0.11583128571510315, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224676352, + "loss": 2.0167, + "grad_norm": 0.45690178871154785, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023009792, + "loss": 1.346, + "grad_norm": 0.11426329612731934, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022972928, + "loss": 1.3018, + "grad_norm": 0.08155354112386703, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023032832, + "loss": 1.3177, + "grad_norm": 0.08372794836759567, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023000576, + "loss": 1.3382, + "grad_norm": 0.1360010951757431, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023011328, + "loss": 1.3018, + "grad_norm": 0.08198140561580658, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023048192, + "loss": 1.3513, + "grad_norm": 0.06720685213804245, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023032832, + "loss": 1.3628, + "grad_norm": 0.1182149276137352, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.887051776, + "gpu_mem": 5.02298368, + "loss": 1.2777, + "grad_norm": 0.12428417056798935, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023028224, + "loss": 1.3643, + "grad_norm": 0.1532914638519287, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.887051776, + "gpu_mem": 5.0230144, + "loss": 1.3425, + "grad_norm": 0.1302020251750946, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022982144, + "loss": 1.3545, + "grad_norm": 0.14355947077274323, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023032832, + "loss": 1.3503, + "grad_norm": 0.23603464663028717, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022971392, + "loss": 1.3557, + "grad_norm": 0.11084804683923721, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023017472, + "loss": 1.3718, + "grad_norm": 0.1903240829706192, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022971392, + "loss": 1.3436, + "grad_norm": 0.13541880249977112, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023002112, + "loss": 1.3735, + "grad_norm": 0.09641522169113159, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022977536, + "loss": 1.3246, + "grad_norm": 0.14845748245716095, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023031296, + "loss": 1.2975, + "grad_norm": 0.11014118045568466, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023012864, + "loss": 1.3234, + "grad_norm": 0.11979362368583679, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022962176, + "loss": 1.3937, + "grad_norm": 0.23128466308116913, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022986752, + "loss": 1.3184, + "grad_norm": 0.1002981886267662, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022989824, + "loss": 1.2619, + "grad_norm": 0.16905032098293304, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022982144, + "loss": 1.3018, + "grad_norm": 0.10458124428987503, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023020544, + "loss": 1.316, + "grad_norm": 0.08298267424106598, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.887051776, + "gpu_mem": 5.02302976, + "loss": 1.2751, + "grad_norm": 0.2082097828388214, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022972928, + "loss": 1.3436, + "grad_norm": 0.13853763043880463, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022972928, + "loss": 1.3316, + "grad_norm": 0.06574063003063202, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022969856, + "loss": 1.2902, + "grad_norm": 0.11772242933511734, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.887051776, + "gpu_mem": 5.02296832, + "loss": 1.2679, + "grad_norm": 0.09213180840015411, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023011328, + "loss": 1.2578, + "grad_norm": 0.10172642022371292, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022949888, + "loss": 1.2992, + "grad_norm": 0.09530587494373322, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.887051776, + "gpu_mem": 5.02299904, + "loss": 1.3052, + "grad_norm": 0.068056121468544, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.887051776, + "gpu_mem": 5.023062016, + "loss": 1.3313, + "grad_norm": 0.14723578095436096, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.887051776, + "gpu_mem": 5.0230144, + "loss": 1.2685, + "grad_norm": 0.09320081025362015, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.887051776, + "gpu_mem": 5.022995968, + "loss": 1.3233, + "grad_norm": 0.06929963827133179, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224863744, + "loss": 1.8982, + "grad_norm": 0.3002396821975708, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224845312, + "loss": 1.3171, + "grad_norm": 0.14029163122177124, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22483456, + "loss": 1.3274, + "grad_norm": 0.10154382139444351, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22488832, + "loss": 1.3339, + "grad_norm": 0.24149997532367706, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224848384, + "loss": 1.3278, + "grad_norm": 0.1440524160861969, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224866816, + "loss": 1.3039, + "grad_norm": 0.0836329460144043, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224929792, + "loss": 1.303, + "grad_norm": 0.07715719193220139, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.887051776, + "gpu_mem": 5.2248576, + "loss": 1.3376, + "grad_norm": 0.12448336184024811, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224851456, + "loss": 1.3454, + "grad_norm": 0.14446860551834106, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224866816, + "loss": 1.3278, + "grad_norm": 0.09190875291824341, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224882176, + "loss": 1.2832, + "grad_norm": 0.12196337431669235, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22487296, + "loss": 1.3066, + "grad_norm": 0.11807198077440262, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224863744, + "loss": 1.3351, + "grad_norm": 0.10722316056489944, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224882176, + "loss": 1.3174, + "grad_norm": 0.10281093418598175, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22488064, + "loss": 1.3021, + "grad_norm": 0.11779104918241501, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224837632, + "loss": 1.3022, + "grad_norm": 0.1253589689731598, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224869888, + "loss": 1.2952, + "grad_norm": 0.10726069658994675, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224823808, + "loss": 1.306, + "grad_norm": 0.08831270784139633, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224868352, + "loss": 1.3094, + "grad_norm": 0.18499088287353516, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.887051776, + "gpu_mem": 5.2248192, + "loss": 1.2969, + "grad_norm": 0.16021127998828888, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224831488, + "loss": 1.3146, + "grad_norm": 0.07457026839256287, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224856064, + "loss": 1.2931, + "grad_norm": 0.12468431890010834, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224817664, + "loss": 1.307, + "grad_norm": 0.0701357051730156, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224820736, + "loss": 1.3259, + "grad_norm": 0.08124849945306778, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224833024, + "loss": 1.3028, + "grad_norm": 0.11030655354261398, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224797696, + "loss": 1.2973, + "grad_norm": 0.08337446302175522, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224839168, + "loss": 1.2647, + "grad_norm": 0.1107446476817131, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224854528, + "loss": 1.268, + "grad_norm": 0.12301309406757355, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.887051776, + "gpu_mem": 5.2248192, + "loss": 1.3339, + "grad_norm": 0.17649513483047485, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.887051776, + "gpu_mem": 5.22482688, + "loss": 1.3567, + "grad_norm": 0.10685434192419052, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224848384, + "loss": 1.3048, + "grad_norm": 0.13099899888038635, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224859136, + "loss": 1.3443, + "grad_norm": 0.11077357828617096, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224851456, + "loss": 1.3505, + "grad_norm": 0.09666340798139572, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224885248, + "loss": 1.3305, + "grad_norm": 0.09434454888105392, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.887051776, + "gpu_mem": 5.224885248, + "train_runtime": 1521.2549, + "train_samples_per_second": 5.919, + "train_steps_per_second": 0.092, + "total_flos": 7571197140185088.0, + "train_loss": 1.5437767037323542 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5723daa9f5f7b854bf548bbee9a6d37e12198a3a --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 16, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 8, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..33172528fd89f072f30582b1f058012df501e488 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.255050505050505 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..b2114b555b256c80f2e5902f2061eab0128db67f --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12615680 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-arc_e-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2", + "seed": 42, + "timestamp": "2025-09-13T14:21:50.476114" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..1198089497f57d78f006abb3d089431c1b2aaf35 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-arc_e-r8-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.880072192, + "gpu_mem": 4.467871232, + "loss": 4.6319, + "grad_norm": 3.9025988578796387, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.885577216, + "gpu_mem": 4.5688576, + "loss": 4.4578, + "grad_norm": 4.026349067687988, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.885773824, + "gpu_mem": 4.568836096, + "loss": 4.6108, + "grad_norm": 3.954428195953369, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.885970432, + "gpu_mem": 4.568814592, + "loss": 4.7087, + "grad_norm": 3.86318302154541, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.885970432, + "gpu_mem": 4.568856064, + "loss": 4.4035, + "grad_norm": 3.8401288986206055, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.88616704, + "gpu_mem": 4.568831488, + "loss": 4.0312, + "grad_norm": 3.9855401515960693, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.88616704, + "gpu_mem": 4.568854528, + "loss": 3.9411, + "grad_norm": 3.8082945346832275, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.88616704, + "gpu_mem": 4.568813056, + "loss": 3.6635, + "grad_norm": 3.595900774002075, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.88616704, + "gpu_mem": 4.568814592, + "loss": 3.6771, + "grad_norm": 3.3186373710632324, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.88616704, + "gpu_mem": 4.568809984, + "loss": 3.1542, + "grad_norm": 3.3927128314971924, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.88616704, + "gpu_mem": 4.56888832, + "loss": 2.9976, + "grad_norm": 3.4722182750701904, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.886363648, + "gpu_mem": 4.568862208, + "loss": 2.706, + "grad_norm": 1.9735649824142456, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.886363648, + "gpu_mem": 4.568813056, + "loss": 2.5481, + "grad_norm": 1.802190899848938, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.886363648, + "gpu_mem": 4.56883456, + "loss": 2.2927, + "grad_norm": 1.4061336517333984, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.886363648, + "gpu_mem": 4.56881152, + "loss": 2.1686, + "grad_norm": 1.0202769041061401, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.886363648, + "gpu_mem": 4.568816128, + "loss": 1.9651, + "grad_norm": 0.7039622664451599, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.886363648, + "gpu_mem": 4.568852992, + "loss": 1.8509, + "grad_norm": 0.7274305820465088, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.886363648, + "gpu_mem": 4.568863744, + "loss": 1.7228, + "grad_norm": 0.7121962308883667, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.886363648, + "gpu_mem": 4.568806912, + "loss": 1.8587, + "grad_norm": 0.8776935338973999, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568877568, + "loss": 1.5145, + "grad_norm": 0.28660663962364197, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568876032, + "loss": 1.5114, + "grad_norm": 0.43042662739753723, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568833024, + "loss": 1.5038, + "grad_norm": 0.4155093729496002, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56884992, + "loss": 1.4149, + "grad_norm": 0.27284905314445496, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568806912, + "loss": 1.4439, + "grad_norm": 0.23430410027503967, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568836096, + "loss": 1.4418, + "grad_norm": 0.27775275707244873, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568816128, + "loss": 1.4751, + "grad_norm": 0.19063498079776764, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56884224, + "loss": 1.3832, + "grad_norm": 0.16227760910987854, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56884224, + "loss": 1.4013, + "grad_norm": 0.28282374143600464, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568820736, + "loss": 1.3459, + "grad_norm": 0.47776976227760315, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56881152, + "loss": 1.374, + "grad_norm": 0.2164515256881714, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568829952, + "loss": 1.3601, + "grad_norm": 0.21440152823925018, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568852992, + "loss": 1.3531, + "grad_norm": 0.19432462751865387, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56884992, + "loss": 1.3435, + "grad_norm": 0.1284203678369522, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568852992, + "loss": 1.3997, + "grad_norm": 0.2482559084892273, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56883456, + "loss": 1.3105, + "grad_norm": 0.23479041457176208, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619288064, + "loss": 1.9635, + "grad_norm": 0.5009448528289795, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619292672, + "loss": 1.3605, + "grad_norm": 0.1490233987569809, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619271168, + "loss": 1.2576, + "grad_norm": 0.1987287700176239, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619260416, + "loss": 1.321, + "grad_norm": 0.3679438531398773, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619323392, + "loss": 1.3804, + "grad_norm": 0.34010475873947144, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619283456, + "loss": 1.3093, + "grad_norm": 0.13461565971374512, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619326464, + "loss": 1.3426, + "grad_norm": 0.14988525211811066, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619275776, + "loss": 1.3871, + "grad_norm": 0.12486442178487778, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619340288, + "loss": 1.3728, + "grad_norm": 0.19987812638282776, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619308032, + "loss": 1.366, + "grad_norm": 0.17538872361183167, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61931264, + "loss": 1.3662, + "grad_norm": 0.15130501985549927, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61925888, + "loss": 1.3132, + "grad_norm": 0.22999729216098785, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619272704, + "loss": 1.3329, + "grad_norm": 0.15070316195487976, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619261952, + "loss": 1.3476, + "grad_norm": 0.19187936186790466, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619275776, + "loss": 1.346, + "grad_norm": 0.23265525698661804, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619328, + "loss": 1.3686, + "grad_norm": 0.2331940084695816, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619275776, + "loss": 1.3615, + "grad_norm": 0.24115760624408722, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619344896, + "loss": 1.2977, + "grad_norm": 0.180658757686615, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61931264, + "loss": 1.3162, + "grad_norm": 0.23167963325977325, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619321856, + "loss": 1.3561, + "grad_norm": 0.2034742385149002, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61929728, + "loss": 1.3118, + "grad_norm": 0.14295290410518646, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619331072, + "loss": 1.3277, + "grad_norm": 0.10579181462526321, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61931264, + "loss": 1.3147, + "grad_norm": 0.14804382622241974, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619298816, + "loss": 1.3235, + "grad_norm": 0.1243520975112915, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619337216, + "loss": 1.3091, + "grad_norm": 0.19099785387516022, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619269632, + "loss": 1.3139, + "grad_norm": 0.17487937211990356, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619317248, + "loss": 1.3663, + "grad_norm": 0.14837004244327545, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61926656, + "loss": 1.3373, + "grad_norm": 0.1990746706724167, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619315712, + "loss": 1.3308, + "grad_norm": 0.19030804932117462, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619314176, + "loss": 1.358, + "grad_norm": 0.13666632771492004, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619332608, + "loss": 1.3176, + "grad_norm": 0.1617087423801422, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61927424, + "loss": 1.3123, + "grad_norm": 0.13441993296146393, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619286528, + "loss": 1.3613, + "grad_norm": 0.1162414401769638, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619311104, + "loss": 1.307, + "grad_norm": 0.17820283770561218, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619288064, + "loss": 1.2812, + "grad_norm": 0.1250607669353485, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619123712, + "loss": 2.0167, + "grad_norm": 0.6089555621147156, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568845312, + "loss": 1.346, + "grad_norm": 0.1651427000761032, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568808448, + "loss": 1.3057, + "grad_norm": 0.1244783028960228, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568868352, + "loss": 1.3145, + "grad_norm": 0.15886901319026947, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568836096, + "loss": 1.3479, + "grad_norm": 0.21860629320144653, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568846848, + "loss": 1.3036, + "grad_norm": 0.11273213475942612, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568883712, + "loss": 1.3603, + "grad_norm": 0.09930621832609177, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568868352, + "loss": 1.3615, + "grad_norm": 0.16133816540241241, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.886560256, + "gpu_mem": 4.5688192, + "loss": 1.283, + "grad_norm": 0.1745380163192749, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568863744, + "loss": 1.3721, + "grad_norm": 0.17223180830478668, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56884992, + "loss": 1.3425, + "grad_norm": 0.19135650992393494, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568817664, + "loss": 1.3548, + "grad_norm": 0.19674985110759735, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568868352, + "loss": 1.3562, + "grad_norm": 0.3431258499622345, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568806912, + "loss": 1.3621, + "grad_norm": 0.18228420615196228, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568852992, + "loss": 1.3799, + "grad_norm": 0.289233535528183, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568806912, + "loss": 1.3499, + "grad_norm": 0.19072002172470093, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568837632, + "loss": 1.373, + "grad_norm": 0.11903022229671478, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568813056, + "loss": 1.3291, + "grad_norm": 0.2206014096736908, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568866816, + "loss": 1.2993, + "grad_norm": 0.14281520247459412, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568848384, + "loss": 1.3331, + "grad_norm": 0.15356840193271637, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568797696, + "loss": 1.3844, + "grad_norm": 0.29866281151771545, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568822272, + "loss": 1.3346, + "grad_norm": 0.1422118991613388, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568825344, + "loss": 1.265, + "grad_norm": 0.26736897230148315, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568817664, + "loss": 1.3092, + "grad_norm": 0.16237102448940277, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568856064, + "loss": 1.316, + "grad_norm": 0.14673510193824768, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56886528, + "loss": 1.2734, + "grad_norm": 0.3606308400630951, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568808448, + "loss": 1.3538, + "grad_norm": 0.16796810925006866, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568808448, + "loss": 1.3407, + "grad_norm": 0.10756803303956985, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568805376, + "loss": 1.2869, + "grad_norm": 0.20476947724819183, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56880384, + "loss": 1.2761, + "grad_norm": 0.14890722930431366, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568846848, + "loss": 1.2617, + "grad_norm": 0.17993809282779694, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568785408, + "loss": 1.3072, + "grad_norm": 0.1884395033121109, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56883456, + "loss": 1.3032, + "grad_norm": 0.1094408631324768, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568897536, + "loss": 1.3347, + "grad_norm": 0.20236344635486603, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.886560256, + "gpu_mem": 4.56884992, + "loss": 1.2678, + "grad_norm": 0.15404537320137024, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.886560256, + "gpu_mem": 4.568831488, + "loss": 1.3269, + "grad_norm": 0.14580389857292175, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619311104, + "loss": 1.9074, + "grad_norm": 0.3640492856502533, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619292672, + "loss": 1.3248, + "grad_norm": 0.16967210173606873, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61928192, + "loss": 1.3313, + "grad_norm": 0.16125313937664032, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61933568, + "loss": 1.3427, + "grad_norm": 0.3960939645767212, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619295744, + "loss": 1.3396, + "grad_norm": 0.2253400981426239, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619314176, + "loss": 1.3238, + "grad_norm": 0.1146518811583519, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619377152, + "loss": 1.3108, + "grad_norm": 0.11487311124801636, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61930496, + "loss": 1.3377, + "grad_norm": 0.18008585274219513, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619298816, + "loss": 1.3484, + "grad_norm": 0.17114248871803284, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619314176, + "loss": 1.3287, + "grad_norm": 0.14098815619945526, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619329536, + "loss": 1.2868, + "grad_norm": 0.15139974653720856, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61932032, + "loss": 1.3028, + "grad_norm": 0.17136478424072266, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619311104, + "loss": 1.3333, + "grad_norm": 0.1550767719745636, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619329536, + "loss": 1.3199, + "grad_norm": 0.1421373337507248, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619328, + "loss": 1.3072, + "grad_norm": 0.18246063590049744, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619284992, + "loss": 1.3044, + "grad_norm": 0.19827844202518463, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619317248, + "loss": 1.3033, + "grad_norm": 0.15101739764213562, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619271168, + "loss": 1.3134, + "grad_norm": 0.11909323930740356, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619315712, + "loss": 1.315, + "grad_norm": 0.2978835999965668, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61926656, + "loss": 1.306, + "grad_norm": 0.25516074895858765, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619278848, + "loss": 1.322, + "grad_norm": 0.11534011363983154, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619303424, + "loss": 1.2986, + "grad_norm": 0.21799878776073456, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619265024, + "loss": 1.3142, + "grad_norm": 0.11690708994865417, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619268096, + "loss": 1.3356, + "grad_norm": 0.1318606287240982, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619280384, + "loss": 1.3065, + "grad_norm": 0.17136652767658234, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619245056, + "loss": 1.304, + "grad_norm": 0.11763548105955124, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619286528, + "loss": 1.2722, + "grad_norm": 0.169888436794281, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619301888, + "loss": 1.2704, + "grad_norm": 0.18496723473072052, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61926656, + "loss": 1.33, + "grad_norm": 0.25345268845558167, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.886560256, + "gpu_mem": 4.61927424, + "loss": 1.3593, + "grad_norm": 0.13358858227729797, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619295744, + "loss": 1.304, + "grad_norm": 0.2519340515136719, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619306496, + "loss": 1.3472, + "grad_norm": 0.17821015417575836, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619298816, + "loss": 1.3442, + "grad_norm": 0.15131331980228424, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619332608, + "loss": 1.3333, + "grad_norm": 0.14364835619926453, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.886560256, + "gpu_mem": 4.619332608, + "train_runtime": 1380.7828, + "train_samples_per_second": 6.521, + "train_steps_per_second": 0.101, + "total_flos": 7307092093022208.0, + "train_loss": 1.6083332913262502 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4758d99093e963e7b960b3e04b3ff68f0cc5fe --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 4, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 2, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..fbb59b248f0026be28d28a1bb56d4df3caffcc63 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.6571865443425077 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5a42aa074c617c1200ea53decef41533e9d8f1c7 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3153920 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-boolq-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2", + "seed": 42, + "timestamp": "2025-09-12T17:07:10.177015" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..1271fe51cb4b89194f71ffcb33cd877ab6db1c64 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r2-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.826816, + "gpu_mem": 4.430388224, + "loss": 8.869, + "grad_norm": 2.6093997955322266, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.833107456, + "gpu_mem": 4.455757312, + "loss": 8.9376, + "grad_norm": 2.735384225845337, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.833893888, + "gpu_mem": 4.455675904, + "loss": 8.9517, + "grad_norm": 2.8548736572265625, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.834483712, + "gpu_mem": 4.455675904, + "loss": 8.9282, + "grad_norm": 2.7734270095825195, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.835073536, + "gpu_mem": 4.455611392, + "loss": 8.7919, + "grad_norm": 2.767235517501831, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.83566336, + "gpu_mem": 4.45563136, + "loss": 8.9569, + "grad_norm": 2.564236879348755, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.836253184, + "gpu_mem": 4.455683584, + "loss": 8.9023, + "grad_norm": 2.8906381130218506, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.8366464, + "gpu_mem": 4.4557696, + "loss": 8.8454, + "grad_norm": 2.9403791427612305, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.837039616, + "gpu_mem": 4.45567744, + "loss": 8.5825, + "grad_norm": 2.799252986907959, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.83762944, + "gpu_mem": 4.4555776, + "loss": 8.6043, + "grad_norm": 2.826673746109009, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.838022656, + "gpu_mem": 4.455682048, + "loss": 8.6315, + "grad_norm": 2.8614091873168945, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.838415872, + "gpu_mem": 4.45605376, + "loss": 8.6006, + "grad_norm": 2.9368252754211426, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.838809088, + "gpu_mem": 4.455657472, + "loss": 8.5055, + "grad_norm": 2.9108381271362305, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.839202304, + "gpu_mem": 4.455634432, + "loss": 8.2975, + "grad_norm": 2.9461123943328857, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.83959552, + "gpu_mem": 4.455572992, + "loss": 8.1004, + "grad_norm": 2.9370806217193604, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.839988736, + "gpu_mem": 4.455657472, + "loss": 8.1226, + "grad_norm": 2.9428882598876953, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.840185344, + "gpu_mem": 4.455697408, + "loss": 7.8381, + "grad_norm": 3.017432928085327, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.84057856, + "gpu_mem": 4.455760384, + "loss": 8.1168, + "grad_norm": 2.9628195762634277, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.840971776, + "gpu_mem": 4.455597568, + "loss": 7.4, + "grad_norm": 3.100884199142456, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.841364992, + "gpu_mem": 4.455709696, + "loss": 7.4253, + "grad_norm": 3.394153118133545, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.8415616, + "gpu_mem": 4.455867904, + "loss": 7.3351, + "grad_norm": 3.3152081966400146, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.841954816, + "gpu_mem": 4.455760384, + "loss": 7.0615, + "grad_norm": 3.5069007873535156, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.842151424, + "gpu_mem": 4.455732736, + "loss": 6.7825, + "grad_norm": 3.3171651363372803, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.84254464, + "gpu_mem": 4.455789568, + "loss": 6.2443, + "grad_norm": 3.7537736892700195, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.842937856, + "gpu_mem": 4.455574528, + "loss": 6.177, + "grad_norm": 3.8140556812286377, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.843134464, + "gpu_mem": 4.455629824, + "loss": 5.6974, + "grad_norm": 3.569026231765747, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.843331072, + "gpu_mem": 4.455921664, + "loss": 5.1472, + "grad_norm": 4.094846725463867, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.843724288, + "gpu_mem": 4.45560064, + "loss": 4.7773, + "grad_norm": 3.8313169479370117, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.843920896, + "gpu_mem": 4.455665152, + "loss": 4.407, + "grad_norm": 3.808440685272217, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.844117504, + "gpu_mem": 4.455743488, + "loss": 3.8313, + "grad_norm": 3.6070327758789062, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.844314112, + "gpu_mem": 4.45554688, + "loss": 3.3325, + "grad_norm": 3.210742473602295, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.84451072, + "gpu_mem": 4.455660544, + "loss": 3.1541, + "grad_norm": 3.1735172271728516, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.844707328, + "gpu_mem": 4.455898624, + "loss": 2.5364, + "grad_norm": 2.6123247146606445, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.844903936, + "gpu_mem": 4.45560064, + "loss": 1.9164, + "grad_norm": 1.9531000852584839, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.845297152, + "gpu_mem": 4.455811072, + "loss": 2.0016, + "grad_norm": 1.9291279315948486, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.84549376, + "gpu_mem": 4.45576192, + "loss": 1.7018, + "grad_norm": 1.373255968093872, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.845690368, + "gpu_mem": 4.455572992, + "loss": 1.5017, + "grad_norm": 1.2521672248840332, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.845886976, + "gpu_mem": 4.455820288, + "loss": 1.4991, + "grad_norm": 1.1270779371261597, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.846083584, + "gpu_mem": 4.45619968, + "loss": 1.2384, + "grad_norm": 0.9001747369766235, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.846280192, + "gpu_mem": 4.4557696, + "loss": 1.2265, + "grad_norm": 0.9694863557815552, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.8464768, + "gpu_mem": 4.455996928, + "loss": 1.3154, + "grad_norm": 1.019802212715149, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.846673408, + "gpu_mem": 4.455894016, + "loss": 0.9553, + "grad_norm": 0.5520044565200806, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.846870016, + "gpu_mem": 4.45571584, + "loss": 0.8405, + "grad_norm": 0.5387597680091858, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.847066624, + "gpu_mem": 4.455858688, + "loss": 0.7805, + "grad_norm": 0.3981980085372925, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.847263232, + "gpu_mem": 4.45563904, + "loss": 0.9714, + "grad_norm": 0.8029425740242004, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.847263232, + "gpu_mem": 4.455881728, + "loss": 0.7864, + "grad_norm": 0.49656426906585693, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.847263232, + "gpu_mem": 4.455605248, + "loss": 0.7032, + "grad_norm": 0.33939453959465027, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.84745984, + "gpu_mem": 4.455682048, + "loss": 0.6884, + "grad_norm": 0.26276156306266785, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.847656448, + "gpu_mem": 4.455698944, + "loss": 0.8937, + "grad_norm": 0.7578421235084534, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.847656448, + "gpu_mem": 4.455637504, + "loss": 0.6308, + "grad_norm": 0.267073392868042, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.847853056, + "gpu_mem": 4.455642112, + "loss": 0.7145, + "grad_norm": 0.7757521271705627, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.848049664, + "gpu_mem": 4.455721984, + "loss": 0.6989, + "grad_norm": 0.349129855632782, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.848049664, + "gpu_mem": 4.455745024, + "loss": 0.6876, + "grad_norm": 0.8621045351028442, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.848049664, + "gpu_mem": 4.455672832, + "loss": 0.7435, + "grad_norm": 0.26083245873451233, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.848246272, + "gpu_mem": 4.455943168, + "loss": 0.6947, + "grad_norm": 0.4885731339454651, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.848246272, + "gpu_mem": 4.455729664, + "loss": 0.6426, + "grad_norm": 0.27352550625801086, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.84844288, + "gpu_mem": 4.45572352, + "loss": 0.6501, + "grad_norm": 0.33800575137138367, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.84844288, + "gpu_mem": 4.455619072, + "loss": 0.6296, + "grad_norm": 0.395685076713562, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.848639488, + "gpu_mem": 4.455635968, + "loss": 0.7319, + "grad_norm": 0.9802838563919067, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.848639488, + "gpu_mem": 4.455729664, + "loss": 0.6095, + "grad_norm": 0.3365701138973236, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.848639488, + "gpu_mem": 4.455740416, + "loss": 0.6207, + "grad_norm": 0.338260293006897, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.848836096, + "gpu_mem": 4.455728128, + "loss": 0.7702, + "grad_norm": 0.9633134007453918, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.848836096, + "gpu_mem": 4.455720448, + "loss": 0.6159, + "grad_norm": 0.7405604720115662, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.849032704, + "gpu_mem": 4.455649792, + "loss": 0.7049, + "grad_norm": 0.467547208070755, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.849032704, + "gpu_mem": 4.455694336, + "loss": 0.6664, + "grad_norm": 0.28826063871383667, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.849032704, + "gpu_mem": 4.455887872, + "loss": 0.6593, + "grad_norm": 0.7321875095367432, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.849229312, + "gpu_mem": 4.455597568, + "loss": 0.6672, + "grad_norm": 0.5612125396728516, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.84942592, + "gpu_mem": 4.455565312, + "loss": 0.7516, + "grad_norm": 0.6401737332344055, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.84942592, + "gpu_mem": 4.45563136, + "loss": 0.6684, + "grad_norm": 0.2867793142795563, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.84942592, + "gpu_mem": 4.455625216, + "loss": 0.7141, + "grad_norm": 0.5054699182510376, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.84942592, + "gpu_mem": 4.45585408, + "loss": 0.5975, + "grad_norm": 0.28301355242729187, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.849622528, + "gpu_mem": 4.4558464, + "loss": 0.7041, + "grad_norm": 0.35183823108673096, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.849622528, + "gpu_mem": 4.455812608, + "loss": 0.7261, + "grad_norm": 0.6441971659660339, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.849622528, + "gpu_mem": 4.455672832, + "loss": 0.6262, + "grad_norm": 0.2839999198913574, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.849622528, + "gpu_mem": 4.455597568, + "loss": 0.6236, + "grad_norm": 0.7037397623062134, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.849819136, + "gpu_mem": 4.455537664, + "loss": 0.7072, + "grad_norm": 0.5704520344734192, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.849819136, + "gpu_mem": 4.455611392, + "loss": 0.616, + "grad_norm": 0.5970688462257385, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.849819136, + "gpu_mem": 4.455663616, + "loss": 0.7553, + "grad_norm": 0.6622800827026367, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.849819136, + "gpu_mem": 4.455795712, + "loss": 0.6647, + "grad_norm": 0.23367147147655487, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.849819136, + "gpu_mem": 4.455686656, + "loss": 0.6577, + "grad_norm": 0.18281403183937073, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.849819136, + "gpu_mem": 4.455566848, + "loss": 0.6325, + "grad_norm": 0.26135876774787903, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.849819136, + "gpu_mem": 4.455635968, + "loss": 0.6845, + "grad_norm": 0.42330271005630493, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.850015744, + "gpu_mem": 4.455735808, + "loss": 0.7185, + "grad_norm": 0.5893667936325073, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.850015744, + "gpu_mem": 4.455698944, + "loss": 0.6589, + "grad_norm": 0.25530919432640076, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.850015744, + "gpu_mem": 4.4557312, + "loss": 0.6087, + "grad_norm": 0.37893298268318176, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.850015744, + "gpu_mem": 4.455682048, + "loss": 0.6328, + "grad_norm": 0.3785851001739502, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.850015744, + "gpu_mem": 4.455689728, + "loss": 0.6801, + "grad_norm": 0.22404277324676514, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.850015744, + "gpu_mem": 4.455834112, + "loss": 0.6184, + "grad_norm": 0.35276269912719727, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.850015744, + "gpu_mem": 4.455616, + "loss": 0.6544, + "grad_norm": 0.33943140506744385, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.850212352, + "gpu_mem": 4.45566976, + "loss": 0.6684, + "grad_norm": 0.26048827171325684, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.850212352, + "gpu_mem": 4.455637504, + "loss": 0.6392, + "grad_norm": 0.2618330419063568, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.850212352, + "gpu_mem": 4.455718912, + "loss": 0.7333, + "grad_norm": 0.5945683121681213, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455522304, + "loss": 0.6981, + "grad_norm": 0.3624102771282196, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455635968, + "loss": 0.6065, + "grad_norm": 0.39492160081863403, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455655936, + "loss": 0.7122, + "grad_norm": 0.2508014142513275, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455694336, + "loss": 0.6722, + "grad_norm": 0.29485297203063965, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455678976, + "loss": 0.6644, + "grad_norm": 0.2120937556028366, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455591424, + "loss": 0.6555, + "grad_norm": 0.1680106520652771, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455540736, + "loss": 0.6393, + "grad_norm": 0.19342853128910065, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455657472, + "loss": 0.6174, + "grad_norm": 0.3688255548477173, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455689728, + "loss": 0.6738, + "grad_norm": 0.18377186357975006, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.85040896, + "gpu_mem": 4.45572352, + "loss": 0.6553, + "grad_norm": 0.1681523472070694, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455774208, + "loss": 0.6859, + "grad_norm": 0.1780584305524826, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455678976, + "loss": 0.6696, + "grad_norm": 0.3029868006706238, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455780352, + "loss": 0.664, + "grad_norm": 0.1715494692325592, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.85040896, + "gpu_mem": 4.4557312, + "loss": 0.6041, + "grad_norm": 0.2795843780040741, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455619072, + "loss": 0.5953, + "grad_norm": 0.2860918343067169, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455803392, + "loss": 0.6481, + "grad_norm": 0.22841840982437134, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455657472, + "loss": 0.6704, + "grad_norm": 0.3137684762477875, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455660544, + "loss": 0.6482, + "grad_norm": 0.2068740427494049, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455629824, + "loss": 0.649, + "grad_norm": 0.42369332909584045, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455675904, + "loss": 0.6612, + "grad_norm": 0.18822750449180603, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455666688, + "loss": 0.6712, + "grad_norm": 0.24965153634548187, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455648256, + "loss": 0.6232, + "grad_norm": 0.2858082950115204, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.85040896, + "gpu_mem": 4.45572352, + "loss": 0.6194, + "grad_norm": 0.20793592929840088, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455643648, + "loss": 0.6414, + "grad_norm": 0.3180989623069763, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455534592, + "loss": 0.6451, + "grad_norm": 0.17328216135501862, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455772672, + "loss": 0.6445, + "grad_norm": 0.22578033804893494, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.85040896, + "gpu_mem": 4.455943168, + "loss": 0.6187, + "grad_norm": 0.2839568853378296, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455675904, + "loss": 0.6133, + "grad_norm": 0.28772634267807007, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455703552, + "loss": 0.6067, + "grad_norm": 0.2220933884382248, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.850605568, + "gpu_mem": 4.45575424, + "loss": 0.59, + "grad_norm": 0.254190593957901, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455563776, + "loss": 0.6498, + "grad_norm": 0.43377062678337097, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.850605568, + "gpu_mem": 4.456006144, + "loss": 0.6294, + "grad_norm": 0.3498164415359497, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455732736, + "loss": 0.563, + "grad_norm": 0.16917088627815247, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455616, + "loss": 0.5816, + "grad_norm": 0.17502300441265106, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.850605568, + "gpu_mem": 4.456055296, + "loss": 0.674, + "grad_norm": 0.6400803327560425, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.850605568, + "gpu_mem": 4.45583104, + "loss": 0.5787, + "grad_norm": 0.2856670320034027, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455870976, + "loss": 0.704, + "grad_norm": 0.467952698469162, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455652864, + "loss": 0.6626, + "grad_norm": 0.2002081573009491, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455781888, + "loss": 0.6846, + "grad_norm": 0.2312997430562973, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455863296, + "loss": 0.607, + "grad_norm": 0.22437311708927155, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.850605568, + "gpu_mem": 4.45564672, + "loss": 0.6148, + "grad_norm": 0.41069039702415466, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455780352, + "loss": 0.6199, + "grad_norm": 0.8219402432441711, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455803392, + "loss": 0.5861, + "grad_norm": 0.2851550877094269, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455640576, + "loss": 0.5793, + "grad_norm": 0.44092613458633423, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455520768, + "loss": 0.6702, + "grad_norm": 0.19708696007728577, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455702016, + "loss": 0.6169, + "grad_norm": 0.33922433853149414, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.850605568, + "gpu_mem": 4.45560064, + "loss": 0.6771, + "grad_norm": 0.49199095368385315, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455652864, + "loss": 0.6248, + "grad_norm": 0.37177667021751404, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.850605568, + "gpu_mem": 4.45568512, + "loss": 0.6267, + "grad_norm": 0.32898959517478943, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455824896, + "loss": 0.5779, + "grad_norm": 0.31023335456848145, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455808, + "loss": 0.6938, + "grad_norm": 0.4730161726474762, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.850605568, + "gpu_mem": 4.456, + "loss": 0.6237, + "grad_norm": 0.20079341530799866, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455711232, + "loss": 0.5858, + "grad_norm": 0.32074663043022156, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.850605568, + "gpu_mem": 4.45574656, + "loss": 0.631, + "grad_norm": 0.20254695415496826, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.850605568, + "gpu_mem": 4.455645184, + "loss": 0.6071, + "grad_norm": 0.3146207928657532, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.850605568, + "gpu_mem": 4.468425216, + "loss": 1.0064, + "grad_norm": 0.5937636494636536, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.850605568, + "gpu_mem": 4.468360704, + "loss": 0.6338, + "grad_norm": 0.22661003470420837, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468197888, + "loss": 0.6522, + "grad_norm": 0.22681213915348053, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.850802176, + "gpu_mem": 4.46827008, + "loss": 0.7436, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468305408, + "loss": 0.6128, + "grad_norm": 0.2025514394044876, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468329984, + "loss": 0.6297, + "grad_norm": 0.5702033638954163, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468291584, + "loss": 0.6089, + "grad_norm": 0.27258849143981934, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468512768, + "loss": 0.6323, + "grad_norm": 0.2791895866394043, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468420608, + "loss": 0.6393, + "grad_norm": 0.5888108015060425, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468326912, + "loss": 0.6448, + "grad_norm": 0.3506273627281189, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468253184, + "loss": 0.6585, + "grad_norm": 0.3157466650009155, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468601856, + "loss": 0.6091, + "grad_norm": 0.22938047349452972, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468196352, + "loss": 0.6561, + "grad_norm": 0.2042321115732193, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468142592, + "loss": 0.6225, + "grad_norm": 0.3129635751247406, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468918272, + "loss": 0.5862, + "grad_norm": 0.20988433063030243, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468394496, + "loss": 0.6459, + "grad_norm": 0.42708247900009155, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468306944, + "loss": 0.6288, + "grad_norm": 0.3510715961456299, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468256256, + "loss": 0.6934, + "grad_norm": 0.44160574674606323, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468351488, + "loss": 0.6431, + "grad_norm": 0.44434499740600586, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468268544, + "loss": 0.6736, + "grad_norm": 0.4769139587879181, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468286976, + "loss": 0.5976, + "grad_norm": 0.198052778840065, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468374528, + "loss": 0.6083, + "grad_norm": 0.3922422230243683, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468259328, + "loss": 0.5889, + "grad_norm": 0.1670912355184555, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468322304, + "loss": 0.642, + "grad_norm": 0.1844894289970398, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468230144, + "loss": 0.6521, + "grad_norm": 0.24012227356433868, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468529664, + "loss": 0.5717, + "grad_norm": 0.23874901235103607, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468253184, + "loss": 0.652, + "grad_norm": 0.3750673830509186, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468219392, + "loss": 0.6436, + "grad_norm": 0.2714921832084656, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.850802176, + "gpu_mem": 4.468357632, + "loss": 0.6137, + "grad_norm": 0.5542746186256409, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468455936, + "loss": 0.6003, + "grad_norm": 0.2969790995121002, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468202496, + "loss": 0.6221, + "grad_norm": 0.3097589910030365, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468302336, + "loss": 0.6021, + "grad_norm": 0.6609083414077759, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468274688, + "loss": 0.6567, + "grad_norm": 0.2192075550556183, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468211712, + "loss": 0.5765, + "grad_norm": 0.22410885989665985, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468429824, + "loss": 0.7055, + "grad_norm": 0.914152204990387, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468326912, + "loss": 0.6088, + "grad_norm": 0.1924823820590973, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468274688, + "loss": 0.5698, + "grad_norm": 0.20785626769065857, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468253184, + "loss": 0.5831, + "grad_norm": 0.22347433865070343, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.853751296, + "gpu_mem": 4.4682624, + "loss": 0.6008, + "grad_norm": 0.3052065968513489, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468194816, + "loss": 0.6478, + "grad_norm": 0.38187873363494873, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468357632, + "loss": 0.6533, + "grad_norm": 0.49509212374687195, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468227072, + "loss": 0.6395, + "grad_norm": 0.25903093814849854, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46834688, + "loss": 0.6139, + "grad_norm": 0.2126762568950653, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468165632, + "loss": 0.5871, + "grad_norm": 0.235030859708786, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468297728, + "loss": 0.573, + "grad_norm": 0.7183632254600525, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468271616, + "loss": 0.6135, + "grad_norm": 0.24626216292381287, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468237824, + "loss": 0.6622, + "grad_norm": 0.2514670193195343, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468342272, + "loss": 0.5794, + "grad_norm": 0.2309395968914032, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468337664, + "loss": 0.6236, + "grad_norm": 0.26372888684272766, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468196352, + "loss": 0.5861, + "grad_norm": 0.4842132031917572, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468388352, + "loss": 0.6036, + "grad_norm": 0.33922138810157776, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46823936, + "loss": 0.659, + "grad_norm": 0.483968049287796, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468342272, + "loss": 0.6001, + "grad_norm": 0.3576025664806366, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468545024, + "loss": 0.5892, + "grad_norm": 0.2619866132736206, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46834688, + "loss": 0.6048, + "grad_norm": 0.2648727297782898, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468233216, + "loss": 0.5363, + "grad_norm": 0.4723322093486786, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46824704, + "loss": 0.6113, + "grad_norm": 0.27361926436424255, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468291584, + "loss": 0.6462, + "grad_norm": 0.55796217918396, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468233216, + "loss": 0.594, + "grad_norm": 0.2293906807899475, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468466688, + "loss": 0.5655, + "grad_norm": 0.43224436044692993, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468497408, + "loss": 0.5342, + "grad_norm": 0.5847765207290649, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46842368, + "loss": 0.674, + "grad_norm": 0.4264238476753235, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468311552, + "loss": 0.5961, + "grad_norm": 0.2815016508102417, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468260864, + "loss": 0.5553, + "grad_norm": 0.5523644089698792, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468230144, + "loss": 0.6456, + "grad_norm": 0.27391669154167175, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468253184, + "loss": 0.6732, + "grad_norm": 0.2961755096912384, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468336128, + "loss": 0.5614, + "grad_norm": 0.27295544743537903, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468263936, + "loss": 0.6628, + "grad_norm": 0.7745182514190674, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468429824, + "loss": 0.5718, + "grad_norm": 0.26692095398902893, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468271616, + "loss": 0.6542, + "grad_norm": 0.7538592219352722, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468248576, + "loss": 0.5793, + "grad_norm": 0.22904421389102936, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468374528, + "loss": 0.6434, + "grad_norm": 0.2408234030008316, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46840832, + "loss": 0.6108, + "grad_norm": 0.3602496087551117, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468276224, + "loss": 0.6844, + "grad_norm": 0.511895477771759, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468412928, + "loss": 0.5369, + "grad_norm": 0.4397069811820984, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468326912, + "loss": 0.5379, + "grad_norm": 0.35749414563179016, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468291584, + "loss": 0.5784, + "grad_norm": 0.32093074917793274, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468256256, + "loss": 0.6256, + "grad_norm": 0.2618851065635681, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468405248, + "loss": 0.5765, + "grad_norm": 0.43652087450027466, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468294656, + "loss": 0.5655, + "grad_norm": 0.27720674872398376, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46823936, + "loss": 0.6989, + "grad_norm": 0.2755570709705353, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468180992, + "loss": 0.6325, + "grad_norm": 0.3191831707954407, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468237824, + "loss": 0.6165, + "grad_norm": 0.2783920466899872, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46851584, + "loss": 0.6547, + "grad_norm": 0.3163803815841675, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46823936, + "loss": 0.6686, + "grad_norm": 0.36094409227371216, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468552704, + "loss": 0.6203, + "grad_norm": 0.23762071132659912, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468428288, + "loss": 0.5908, + "grad_norm": 0.35159897804260254, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468184064, + "loss": 0.5946, + "grad_norm": 0.6495630741119385, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468243968, + "loss": 0.6237, + "grad_norm": 0.2745390236377716, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468305408, + "loss": 0.5639, + "grad_norm": 0.5049595236778259, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468306944, + "loss": 0.6509, + "grad_norm": 0.5199962258338928, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468560384, + "loss": 0.6964, + "grad_norm": 0.34460026025772095, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468210176, + "loss": 0.6684, + "grad_norm": 0.21762734651565552, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468506624, + "loss": 0.616, + "grad_norm": 0.4349551498889923, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468368384, + "loss": 0.6413, + "grad_norm": 0.31837329268455505, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468220928, + "loss": 0.7088, + "grad_norm": 0.525629997253418, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468360704, + "loss": 0.5857, + "grad_norm": 0.3820871412754059, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46823936, + "loss": 0.6388, + "grad_norm": 0.2547449469566345, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468333056, + "loss": 0.6052, + "grad_norm": 0.320404589176178, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468351488, + "loss": 0.6075, + "grad_norm": 0.3338000476360321, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468303872, + "loss": 0.5874, + "grad_norm": 0.2440679371356964, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468210176, + "loss": 0.5831, + "grad_norm": 0.29768481850624084, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468302336, + "loss": 0.5316, + "grad_norm": 0.4611869752407074, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468214784, + "loss": 0.638, + "grad_norm": 0.25020280480384827, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468257792, + "loss": 0.6803, + "grad_norm": 0.38832998275756836, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468451328, + "loss": 0.6573, + "grad_norm": 0.28276100754737854, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46824704, + "loss": 0.6542, + "grad_norm": 0.3274550437927246, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468403712, + "loss": 0.6741, + "grad_norm": 0.7640435099601746, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468230144, + "loss": 0.5957, + "grad_norm": 0.271648645401001, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468643328, + "loss": 0.5997, + "grad_norm": 0.34759384393692017, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468302336, + "loss": 0.5801, + "grad_norm": 0.5884922742843628, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468219392, + "loss": 0.6205, + "grad_norm": 0.28841882944107056, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468336128, + "loss": 0.5421, + "grad_norm": 0.25886058807373047, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468291584, + "loss": 0.5227, + "grad_norm": 0.5446920394897461, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468248576, + "loss": 0.6253, + "grad_norm": 0.3370090126991272, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46828544, + "loss": 0.5821, + "grad_norm": 0.3236730098724365, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468372992, + "loss": 0.5609, + "grad_norm": 0.32639986276626587, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468291584, + "loss": 0.6096, + "grad_norm": 0.32308170199394226, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468506624, + "loss": 0.6637, + "grad_norm": 2.7500827312469482, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468299264, + "loss": 0.5843, + "grad_norm": 0.7114717364311218, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468303872, + "loss": 0.6054, + "grad_norm": 0.47407907247543335, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468314624, + "loss": 0.6062, + "grad_norm": 0.47521302103996277, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468353024, + "loss": 0.6826, + "grad_norm": 0.26419681310653687, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468405248, + "loss": 0.5893, + "grad_norm": 0.3205205500125885, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468263936, + "loss": 0.6154, + "grad_norm": 0.46389511227607727, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468144128, + "loss": 0.6674, + "grad_norm": 0.3916938304901123, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468371456, + "loss": 0.6905, + "grad_norm": 0.4528336822986603, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46861568, + "loss": 0.5567, + "grad_norm": 0.22137446701526642, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468276224, + "loss": 0.6386, + "grad_norm": 0.25773534178733826, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468222464, + "loss": 0.6737, + "grad_norm": 0.3573743999004364, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46838528, + "loss": 0.6179, + "grad_norm": 0.23479345440864563, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468325376, + "loss": 0.6682, + "grad_norm": 0.25542300939559937, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468305408, + "loss": 0.6634, + "grad_norm": 0.4205879271030426, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468240896, + "loss": 0.652, + "grad_norm": 0.27616575360298157, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46866944, + "loss": 0.6324, + "grad_norm": 0.7122885584831238, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468376064, + "loss": 0.6099, + "grad_norm": 0.29265135526657104, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46823168, + "loss": 0.5862, + "grad_norm": 0.2867797017097473, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46828544, + "loss": 0.6328, + "grad_norm": 0.32424023747444153, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468703232, + "loss": 0.5849, + "grad_norm": 0.5058652758598328, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468472832, + "loss": 0.6561, + "grad_norm": 0.5653034448623657, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468257792, + "loss": 0.6279, + "grad_norm": 0.35074102878570557, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468342272, + "loss": 0.6451, + "grad_norm": 0.43168380856513977, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468267008, + "loss": 0.7058, + "grad_norm": 0.6807760000228882, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468302336, + "loss": 0.6867, + "grad_norm": 0.33638420701026917, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.853751296, + "gpu_mem": 4.46838528, + "loss": 0.6334, + "grad_norm": 0.3186628222465515, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468302336, + "loss": 0.7583, + "grad_norm": 0.4976275563240051, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468328448, + "loss": 0.6494, + "grad_norm": 0.48173174262046814, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.853751296, + "gpu_mem": 4.468328448, + "train_runtime": 8788.6119, + "train_samples_per_second": 2.145, + "train_steps_per_second": 0.033, + "total_flos": 4.709175406932787e+16, + "train_loss": 1.3969791707943897 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef1d724eca7640a4f365c193cda2fc4efdb2073 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5106719f7546734181da15e7f4e2e40fcc1cf4f6 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.6844036697247706 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..31421eeb0759ac357feffd1b80ad7c6a8c990f24 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 50462720 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-boolq-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2", + "seed": 42, + "timestamp": "2025-09-13T20:49:07.709424" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..41ed02b241ebd2b001ef31c8125ad5bd30beee98 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r32-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.846718464, + "gpu_mem": 4.619623424, + "loss": 8.869, + "grad_norm": 3.042114496231079, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.853206528, + "gpu_mem": 5.023462912, + "loss": 8.9376, + "grad_norm": 3.146047830581665, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.853796352, + "gpu_mem": 5.023381504, + "loss": 8.9295, + "grad_norm": 3.1460790634155273, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.854582784, + "gpu_mem": 5.023381504, + "loss": 8.8601, + "grad_norm": 3.2027969360351562, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.854976, + "gpu_mem": 5.023316992, + "loss": 8.6577, + "grad_norm": 3.3020784854888916, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.855565824, + "gpu_mem": 5.02333696, + "loss": 8.7417, + "grad_norm": 3.018744945526123, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.856155648, + "gpu_mem": 5.023389184, + "loss": 8.5601, + "grad_norm": 3.2627673149108887, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.856745472, + "gpu_mem": 5.0234752, + "loss": 8.3684, + "grad_norm": 3.352971315383911, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.857138688, + "gpu_mem": 5.02338304, + "loss": 7.9703, + "grad_norm": 3.3595988750457764, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.857728512, + "gpu_mem": 5.0232832, + "loss": 7.8138, + "grad_norm": 3.24662184715271, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.858121728, + "gpu_mem": 5.023387648, + "loss": 7.601, + "grad_norm": 3.395935297012329, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.858514944, + "gpu_mem": 5.02375936, + "loss": 7.3328, + "grad_norm": 3.423154592514038, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.85890816, + "gpu_mem": 5.023363072, + "loss": 7.028, + "grad_norm": 3.327317476272583, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.859301376, + "gpu_mem": 5.023340032, + "loss": 6.4518, + "grad_norm": 3.477672576904297, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.859694592, + "gpu_mem": 5.023278592, + "loss": 6.043, + "grad_norm": 3.3920552730560303, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.860087808, + "gpu_mem": 5.023363072, + "loss": 5.6741, + "grad_norm": 3.3612961769104004, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.860481024, + "gpu_mem": 5.023403008, + "loss": 4.9799, + "grad_norm": 3.5613696575164795, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.860677632, + "gpu_mem": 5.023465984, + "loss": 4.8027, + "grad_norm": 3.5409622192382812, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.861070848, + "gpu_mem": 5.023303168, + "loss": 3.6884, + "grad_norm": 3.2301747798919678, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.861464064, + "gpu_mem": 5.023415296, + "loss": 3.1386, + "grad_norm": 2.9546728134155273, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.861660672, + "gpu_mem": 5.023573504, + "loss": 2.8159, + "grad_norm": 2.5422348976135254, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.862053888, + "gpu_mem": 5.023465984, + "loss": 2.2352, + "grad_norm": 1.905133605003357, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.862250496, + "gpu_mem": 5.023438336, + "loss": 2.0096, + "grad_norm": 1.4887255430221558, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.862447104, + "gpu_mem": 5.023495168, + "loss": 1.416, + "grad_norm": 1.0154629945755005, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.86284032, + "gpu_mem": 5.023280128, + "loss": 1.3383, + "grad_norm": 0.8761482238769531, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.863036928, + "gpu_mem": 5.023335424, + "loss": 1.1409, + "grad_norm": 0.6635974645614624, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.863430144, + "gpu_mem": 5.023627264, + "loss": 0.87, + "grad_norm": 0.5957152843475342, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.863626752, + "gpu_mem": 5.02330624, + "loss": 0.9038, + "grad_norm": 0.4361616373062134, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.864019968, + "gpu_mem": 5.023370752, + "loss": 0.8665, + "grad_norm": 0.47157764434814453, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.864216576, + "gpu_mem": 5.023449088, + "loss": 0.8217, + "grad_norm": 0.4374017119407654, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.864413184, + "gpu_mem": 5.02325248, + "loss": 0.635, + "grad_norm": 0.17127206921577454, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.864609792, + "gpu_mem": 5.023366144, + "loss": 0.7516, + "grad_norm": 0.23733778297901154, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.8648064, + "gpu_mem": 5.023604224, + "loss": 0.6732, + "grad_norm": 0.394763320684433, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.865003008, + "gpu_mem": 5.02330624, + "loss": 0.7163, + "grad_norm": 0.7849714756011963, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.865199616, + "gpu_mem": 5.023516672, + "loss": 0.6832, + "grad_norm": 0.3544332683086395, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.865396224, + "gpu_mem": 5.02346752, + "loss": 0.7138, + "grad_norm": 0.32547086477279663, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.865592832, + "gpu_mem": 5.023278592, + "loss": 0.7027, + "grad_norm": 0.2586589455604553, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.865986048, + "gpu_mem": 5.023525888, + "loss": 0.7468, + "grad_norm": 0.4783079922199249, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.866182656, + "gpu_mem": 5.02390528, + "loss": 0.6703, + "grad_norm": 0.24983654916286469, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.866182656, + "gpu_mem": 5.0234752, + "loss": 0.6589, + "grad_norm": 0.4332180917263031, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.866379264, + "gpu_mem": 5.023702528, + "loss": 0.6867, + "grad_norm": 0.2247151881456375, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.866575872, + "gpu_mem": 5.023599616, + "loss": 0.6635, + "grad_norm": 0.19143211841583252, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.86677248, + "gpu_mem": 5.02342144, + "loss": 0.6223, + "grad_norm": 0.29915115237236023, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.866969088, + "gpu_mem": 5.023564288, + "loss": 0.6186, + "grad_norm": 0.18584896624088287, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.866969088, + "gpu_mem": 5.02334464, + "loss": 0.8531, + "grad_norm": 1.1423810720443726, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.867165696, + "gpu_mem": 5.023587328, + "loss": 0.6769, + "grad_norm": 0.20281441509723663, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.867362304, + "gpu_mem": 5.023310848, + "loss": 0.6337, + "grad_norm": 0.48029351234436035, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.867558912, + "gpu_mem": 5.023387648, + "loss": 0.6872, + "grad_norm": 0.744110643863678, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.86775552, + "gpu_mem": 5.023404544, + "loss": 0.7656, + "grad_norm": 0.46564292907714844, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.86775552, + "gpu_mem": 5.023343104, + "loss": 0.6049, + "grad_norm": 0.23375388979911804, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.867952128, + "gpu_mem": 5.023347712, + "loss": 0.6173, + "grad_norm": 0.28572502732276917, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.867952128, + "gpu_mem": 5.023427584, + "loss": 0.7053, + "grad_norm": 0.4270164668560028, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.868148736, + "gpu_mem": 5.023450624, + "loss": 0.5915, + "grad_norm": 0.13602399826049805, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.868148736, + "gpu_mem": 5.023378432, + "loss": 0.7928, + "grad_norm": 0.6496982574462891, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.868345344, + "gpu_mem": 5.023648768, + "loss": 0.6391, + "grad_norm": 0.17558370530605316, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.868345344, + "gpu_mem": 5.023435264, + "loss": 0.6578, + "grad_norm": 0.1138649508357048, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.868345344, + "gpu_mem": 5.02342912, + "loss": 0.6502, + "grad_norm": 0.4735816419124603, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.868541952, + "gpu_mem": 5.023324672, + "loss": 0.6207, + "grad_norm": 0.35316169261932373, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.868541952, + "gpu_mem": 5.023341568, + "loss": 0.6709, + "grad_norm": 0.16203640401363373, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.86873856, + "gpu_mem": 5.023435264, + "loss": 0.6008, + "grad_norm": 0.18635128438472748, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.86873856, + "gpu_mem": 5.023446016, + "loss": 0.6121, + "grad_norm": 0.128395214676857, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.86873856, + "gpu_mem": 5.023433728, + "loss": 0.7691, + "grad_norm": 0.6999316811561584, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.868935168, + "gpu_mem": 5.023426048, + "loss": 0.5718, + "grad_norm": 0.11580169945955276, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.869131776, + "gpu_mem": 5.023355392, + "loss": 0.6319, + "grad_norm": 0.1087900847196579, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.869131776, + "gpu_mem": 5.023399936, + "loss": 0.6716, + "grad_norm": 0.11916189640760422, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.869131776, + "gpu_mem": 5.023593472, + "loss": 0.6361, + "grad_norm": 0.3694639503955841, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.869131776, + "gpu_mem": 5.023303168, + "loss": 0.6498, + "grad_norm": 0.3680834174156189, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.869131776, + "gpu_mem": 5.023270912, + "loss": 0.7044, + "grad_norm": 0.3008520305156708, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.869328384, + "gpu_mem": 5.02333696, + "loss": 0.6387, + "grad_norm": 0.10104458779096603, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.869524992, + "gpu_mem": 5.023330816, + "loss": 0.6676, + "grad_norm": 0.14583146572113037, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.869524992, + "gpu_mem": 5.02355968, + "loss": 0.5922, + "grad_norm": 0.16506627202033997, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.869524992, + "gpu_mem": 5.023552, + "loss": 0.6587, + "grad_norm": 0.10320814698934555, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.8697216, + "gpu_mem": 5.023518208, + "loss": 0.7194, + "grad_norm": 0.4403783977031708, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.8697216, + "gpu_mem": 5.023378432, + "loss": 0.61, + "grad_norm": 0.23258762061595917, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.8697216, + "gpu_mem": 5.023303168, + "loss": 0.5948, + "grad_norm": 0.4660118818283081, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.8697216, + "gpu_mem": 5.023243264, + "loss": 0.6557, + "grad_norm": 0.27212899923324585, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.8697216, + "gpu_mem": 5.023316992, + "loss": 0.578, + "grad_norm": 0.24673743546009064, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.8697216, + "gpu_mem": 5.023369216, + "loss": 0.8033, + "grad_norm": 0.8064074516296387, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.869918208, + "gpu_mem": 5.023501312, + "loss": 0.6549, + "grad_norm": 0.2466876208782196, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.869918208, + "gpu_mem": 5.023392256, + "loss": 0.6268, + "grad_norm": 0.12053028494119644, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.869918208, + "gpu_mem": 5.023272448, + "loss": 0.636, + "grad_norm": 0.17156650125980377, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.869918208, + "gpu_mem": 5.023341568, + "loss": 0.6705, + "grad_norm": 0.3170614540576935, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.869918208, + "gpu_mem": 5.023441408, + "loss": 0.6676, + "grad_norm": 0.23627905547618866, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.869918208, + "gpu_mem": 5.023404544, + "loss": 0.6546, + "grad_norm": 0.4949694275856018, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.869918208, + "gpu_mem": 5.0234368, + "loss": 0.6096, + "grad_norm": 0.5397067666053772, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.869918208, + "gpu_mem": 5.023387648, + "loss": 0.6175, + "grad_norm": 0.1353190839290619, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.870114816, + "gpu_mem": 5.023395328, + "loss": 0.642, + "grad_norm": 0.2982172667980194, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.870114816, + "gpu_mem": 5.023539712, + "loss": 0.6022, + "grad_norm": 0.16495056450366974, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.870311424, + "gpu_mem": 5.0233216, + "loss": 0.6863, + "grad_norm": 0.5650367736816406, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.870311424, + "gpu_mem": 5.02337536, + "loss": 0.6662, + "grad_norm": 0.2848023474216461, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.870311424, + "gpu_mem": 5.023343104, + "loss": 0.5955, + "grad_norm": 0.24643996357917786, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.870311424, + "gpu_mem": 5.023424512, + "loss": 0.6867, + "grad_norm": 0.2992117702960968, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.870311424, + "gpu_mem": 5.023227904, + "loss": 0.7091, + "grad_norm": 0.11756899207830429, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.870311424, + "gpu_mem": 5.023341568, + "loss": 0.6598, + "grad_norm": 0.6866552233695984, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.870311424, + "gpu_mem": 5.023361536, + "loss": 0.6936, + "grad_norm": 0.3745550513267517, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.870311424, + "gpu_mem": 5.023399936, + "loss": 0.6284, + "grad_norm": 0.2467467188835144, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.870311424, + "gpu_mem": 5.023384576, + "loss": 0.6718, + "grad_norm": 0.19287262856960297, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023297024, + "loss": 0.656, + "grad_norm": 0.3111989200115204, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023246336, + "loss": 0.6494, + "grad_norm": 0.357318639755249, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023363072, + "loss": 0.6283, + "grad_norm": 0.13202090561389923, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023395328, + "loss": 0.6475, + "grad_norm": 0.21935313940048218, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.870508032, + "gpu_mem": 5.02342912, + "loss": 0.6299, + "grad_norm": 0.11016760021448135, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023479808, + "loss": 0.6521, + "grad_norm": 0.12788377702236176, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023384576, + "loss": 0.6177, + "grad_norm": 0.12298800051212311, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023485952, + "loss": 0.635, + "grad_norm": 0.24654819071292877, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.870508032, + "gpu_mem": 5.0234368, + "loss": 0.6521, + "grad_norm": 0.4598885178565979, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023324672, + "loss": 0.5749, + "grad_norm": 0.22293901443481445, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023508992, + "loss": 0.6071, + "grad_norm": 0.1253669410943985, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023363072, + "loss": 0.6863, + "grad_norm": 0.33255815505981445, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023366144, + "loss": 0.6561, + "grad_norm": 0.38743141293525696, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023335424, + "loss": 0.661, + "grad_norm": 0.4689392149448395, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023381504, + "loss": 0.6225, + "grad_norm": 0.14543704688549042, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023372288, + "loss": 0.625, + "grad_norm": 0.13054385781288147, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.870508032, + "gpu_mem": 5.023353856, + "loss": 0.6177, + "grad_norm": 0.34386685490608215, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.870508032, + "gpu_mem": 5.02342912, + "loss": 0.616, + "grad_norm": 0.3601931035518646, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023349248, + "loss": 0.6503, + "grad_norm": 0.3698611259460449, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023240192, + "loss": 0.6212, + "grad_norm": 0.13360032439231873, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023478272, + "loss": 0.5798, + "grad_norm": 0.15429022908210754, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023648768, + "loss": 0.5775, + "grad_norm": 0.12612316012382507, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023381504, + "loss": 0.5839, + "grad_norm": 0.12544581294059753, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023409152, + "loss": 0.5824, + "grad_norm": 0.13359619677066803, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.87070464, + "gpu_mem": 5.02345984, + "loss": 0.552, + "grad_norm": 0.18490256369113922, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023269376, + "loss": 0.6268, + "grad_norm": 0.510979950428009, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023711744, + "loss": 0.5699, + "grad_norm": 0.2778141498565674, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023438336, + "loss": 0.5396, + "grad_norm": 0.17448659241199493, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.87070464, + "gpu_mem": 5.0233216, + "loss": 0.5575, + "grad_norm": 0.22467684745788574, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023760896, + "loss": 0.6045, + "grad_norm": 0.20929023623466492, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.87070464, + "gpu_mem": 5.02353664, + "loss": 0.595, + "grad_norm": 0.4037820100784302, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023576576, + "loss": 0.6431, + "grad_norm": 0.16436971724033356, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023358464, + "loss": 0.6762, + "grad_norm": 0.11991788446903229, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023487488, + "loss": 0.6389, + "grad_norm": 0.1495029330253601, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023568896, + "loss": 0.5735, + "grad_norm": 0.177620992064476, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.87070464, + "gpu_mem": 5.02335232, + "loss": 0.5882, + "grad_norm": 0.20325639843940735, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.87070464, + "gpu_mem": 5.023485952, + "loss": 0.5542, + "grad_norm": 0.47119808197021484, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023508992, + "loss": 0.5576, + "grad_norm": 0.16285395622253418, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023346176, + "loss": 0.5104, + "grad_norm": 0.14363999664783478, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023226368, + "loss": 0.6778, + "grad_norm": 0.44658613204956055, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023407616, + "loss": 0.6227, + "grad_norm": 0.4282582998275757, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.871491072, + "gpu_mem": 5.02330624, + "loss": 0.6186, + "grad_norm": 0.3712233603000641, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023358464, + "loss": 0.56, + "grad_norm": 0.13681486248970032, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.871491072, + "gpu_mem": 5.02339072, + "loss": 0.5801, + "grad_norm": 0.22461175918579102, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023530496, + "loss": 0.564, + "grad_norm": 0.5137683153152466, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.871491072, + "gpu_mem": 5.0235136, + "loss": 0.6957, + "grad_norm": 0.19101597368717194, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.871491072, + "gpu_mem": 5.0237056, + "loss": 0.5769, + "grad_norm": 0.1593334674835205, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023416832, + "loss": 0.5933, + "grad_norm": 0.3929411768913269, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.871491072, + "gpu_mem": 5.02345216, + "loss": 0.5723, + "grad_norm": 0.16411206126213074, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.871491072, + "gpu_mem": 5.023350784, + "loss": 0.603, + "grad_norm": 0.1643112599849701, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225366016, + "loss": 0.9492, + "grad_norm": 0.660728394985199, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225301504, + "loss": 0.5891, + "grad_norm": 0.20600003004074097, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225138688, + "loss": 0.6098, + "grad_norm": 0.23444625735282898, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22521088, + "loss": 0.7312, + "grad_norm": 0.2805699110031128, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225246208, + "loss": 0.5216, + "grad_norm": 0.18168288469314575, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225270784, + "loss": 0.6255, + "grad_norm": 0.4077012538909912, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225232384, + "loss": 0.5898, + "grad_norm": 0.2513872981071472, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225453568, + "loss": 0.6035, + "grad_norm": 0.2890379726886749, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225361408, + "loss": 0.6487, + "grad_norm": 0.4876692593097687, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225267712, + "loss": 0.5994, + "grad_norm": 0.14750680327415466, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225193984, + "loss": 0.591, + "grad_norm": 0.13341213762760162, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225542656, + "loss": 0.5314, + "grad_norm": 0.11648880690336227, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225137152, + "loss": 0.6416, + "grad_norm": 0.223282128572464, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225083392, + "loss": 0.546, + "grad_norm": 0.13665218651294708, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225859072, + "loss": 0.5486, + "grad_norm": 0.16804996132850647, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225335296, + "loss": 0.6578, + "grad_norm": 0.31079021096229553, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225247744, + "loss": 0.647, + "grad_norm": 0.17374569177627563, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225197056, + "loss": 0.6055, + "grad_norm": 0.2730662226676941, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225292288, + "loss": 0.5914, + "grad_norm": 0.23634549975395203, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225209344, + "loss": 0.637, + "grad_norm": 0.17366164922714233, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225227776, + "loss": 0.5928, + "grad_norm": 0.32267147302627563, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225315328, + "loss": 0.6065, + "grad_norm": 0.4495171904563904, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225200128, + "loss": 0.5631, + "grad_norm": 0.23280799388885498, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225263104, + "loss": 0.6123, + "grad_norm": 0.14402291178703308, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225170944, + "loss": 0.5963, + "grad_norm": 0.1302947700023651, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225470464, + "loss": 0.5652, + "grad_norm": 0.12529513239860535, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225193984, + "loss": 0.7194, + "grad_norm": 0.3564681112766266, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225160192, + "loss": 0.585, + "grad_norm": 0.18094022572040558, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225298432, + "loss": 0.5272, + "grad_norm": 0.26489946246147156, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225396736, + "loss": 0.5662, + "grad_norm": 0.14967241883277893, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225143296, + "loss": 0.5661, + "grad_norm": 0.1831762194633484, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225243136, + "loss": 0.6176, + "grad_norm": 0.3751544952392578, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225215488, + "loss": 0.6114, + "grad_norm": 0.15540283918380737, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225152512, + "loss": 0.5583, + "grad_norm": 0.14212781190872192, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225370624, + "loss": 0.6592, + "grad_norm": 0.5663373470306396, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225267712, + "loss": 0.6192, + "grad_norm": 0.13510197401046753, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225215488, + "loss": 0.5444, + "grad_norm": 0.17342118918895721, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225193984, + "loss": 0.5495, + "grad_norm": 0.15126851201057434, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.871491072, + "gpu_mem": 5.2252032, + "loss": 0.5073, + "grad_norm": 0.15851685404777527, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225135616, + "loss": 0.6398, + "grad_norm": 0.19668269157409668, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225298432, + "loss": 0.5876, + "grad_norm": 0.27273330092430115, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225167872, + "loss": 0.6522, + "grad_norm": 0.16754496097564697, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22528768, + "loss": 0.5582, + "grad_norm": 0.20495641231536865, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225106432, + "loss": 0.5476, + "grad_norm": 0.18175441026687622, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225238528, + "loss": 0.5283, + "grad_norm": 0.5028181672096252, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225212416, + "loss": 0.5734, + "grad_norm": 0.16918456554412842, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225178624, + "loss": 0.6704, + "grad_norm": 0.19521644711494446, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225283072, + "loss": 0.5426, + "grad_norm": 0.20951034128665924, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225278464, + "loss": 0.52, + "grad_norm": 0.19132260978221893, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225137152, + "loss": 0.5367, + "grad_norm": 0.31606876850128174, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225329152, + "loss": 0.534, + "grad_norm": 0.1973571926355362, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22518016, + "loss": 0.6115, + "grad_norm": 0.36959734559059143, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225283072, + "loss": 0.5945, + "grad_norm": 0.26384681463241577, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225485824, + "loss": 0.5395, + "grad_norm": 0.16042476892471313, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22528768, + "loss": 0.5736, + "grad_norm": 0.16286450624465942, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225174016, + "loss": 0.491, + "grad_norm": 0.3161409795284271, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22518784, + "loss": 0.5694, + "grad_norm": 0.1815277636051178, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225232384, + "loss": 0.5988, + "grad_norm": 0.3414490520954132, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225174016, + "loss": 0.5458, + "grad_norm": 0.1929984986782074, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225407488, + "loss": 0.5894, + "grad_norm": 0.3814259469509125, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225438208, + "loss": 0.5484, + "grad_norm": 0.5490694642066956, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22536448, + "loss": 0.6131, + "grad_norm": 0.21227596700191498, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225252352, + "loss": 0.6105, + "grad_norm": 0.23302190005779266, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225201664, + "loss": 0.486, + "grad_norm": 0.3283984959125519, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225170944, + "loss": 0.6435, + "grad_norm": 0.27306556701660156, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225193984, + "loss": 0.6862, + "grad_norm": 0.3469858467578888, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225276928, + "loss": 0.5356, + "grad_norm": 0.2371579259634018, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225204736, + "loss": 0.668, + "grad_norm": 0.6338670253753662, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225370624, + "loss": 0.5353, + "grad_norm": 0.19290903210639954, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225212416, + "loss": 0.6452, + "grad_norm": 0.5430368781089783, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225189376, + "loss": 0.5442, + "grad_norm": 0.14057472348213196, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225315328, + "loss": 0.5932, + "grad_norm": 0.13682378828525543, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22534912, + "loss": 0.5825, + "grad_norm": 0.18990926444530487, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225217024, + "loss": 0.627, + "grad_norm": 0.23651687800884247, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225353728, + "loss": 0.5422, + "grad_norm": 0.3717305660247803, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225267712, + "loss": 0.5473, + "grad_norm": 0.3648890256881714, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225232384, + "loss": 0.588, + "grad_norm": 0.3533920645713806, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225197056, + "loss": 0.5837, + "grad_norm": 0.1884913295507431, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225346048, + "loss": 0.5197, + "grad_norm": 0.3794596791267395, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225235456, + "loss": 0.5137, + "grad_norm": 0.17430512607097626, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22518016, + "loss": 0.6881, + "grad_norm": 0.22330285608768463, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225121792, + "loss": 0.5801, + "grad_norm": 0.16818001866340637, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225178624, + "loss": 0.5655, + "grad_norm": 0.22824430465698242, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22545664, + "loss": 0.6239, + "grad_norm": 0.20222674310207367, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22518016, + "loss": 0.6274, + "grad_norm": 0.20738743245601654, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225493504, + "loss": 0.5455, + "grad_norm": 0.24053917825222015, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225369088, + "loss": 0.5561, + "grad_norm": 0.33785226941108704, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225124864, + "loss": 0.5241, + "grad_norm": 0.3118985593318939, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225184768, + "loss": 0.578, + "grad_norm": 0.23728522658348083, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225246208, + "loss": 0.5147, + "grad_norm": 0.25529155135154724, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225247744, + "loss": 0.5871, + "grad_norm": 0.22935517132282257, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225501184, + "loss": 0.6639, + "grad_norm": 0.28994080424308777, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225150976, + "loss": 0.6768, + "grad_norm": 0.16814063489437103, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225447424, + "loss": 0.56, + "grad_norm": 0.2648347020149231, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225309184, + "loss": 0.5894, + "grad_norm": 0.25962719321250916, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225161728, + "loss": 0.6934, + "grad_norm": 0.3970635235309601, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225301504, + "loss": 0.5626, + "grad_norm": 0.22156696021556854, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22518016, + "loss": 0.6038, + "grad_norm": 0.16415396332740784, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225273856, + "loss": 0.5576, + "grad_norm": 0.2100769281387329, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225292288, + "loss": 0.5211, + "grad_norm": 0.18482615053653717, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225244672, + "loss": 0.561, + "grad_norm": 0.12901265919208527, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225150976, + "loss": 0.5202, + "grad_norm": 0.2598748803138733, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225243136, + "loss": 0.5597, + "grad_norm": 0.3591465950012207, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225155584, + "loss": 0.5758, + "grad_norm": 0.20229923725128174, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225198592, + "loss": 0.6394, + "grad_norm": 0.2406589239835739, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225392128, + "loss": 0.6167, + "grad_norm": 0.20660465955734253, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22518784, + "loss": 0.6005, + "grad_norm": 0.2539401948451996, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225344512, + "loss": 0.6158, + "grad_norm": 0.4432738423347473, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225170944, + "loss": 0.5868, + "grad_norm": 0.22313469648361206, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225584128, + "loss": 0.5734, + "grad_norm": 0.2720390260219574, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225243136, + "loss": 0.593, + "grad_norm": 0.4196253716945648, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225160192, + "loss": 0.5543, + "grad_norm": 0.14926786720752716, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225276928, + "loss": 0.4745, + "grad_norm": 0.20046955347061157, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225232384, + "loss": 0.4664, + "grad_norm": 0.27472227811813354, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225189376, + "loss": 0.529, + "grad_norm": 0.17377826571464539, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22522624, + "loss": 0.5524, + "grad_norm": 0.18128226697444916, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225313792, + "loss": 0.5246, + "grad_norm": 0.21767883002758026, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225232384, + "loss": 0.6018, + "grad_norm": 0.25386571884155273, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225447424, + "loss": 0.5375, + "grad_norm": 0.1767737865447998, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225240064, + "loss": 0.5474, + "grad_norm": 0.4066689908504486, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225244672, + "loss": 0.5126, + "grad_norm": 0.2731332778930664, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225255424, + "loss": 0.5449, + "grad_norm": 0.33967921137809753, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225293824, + "loss": 0.6284, + "grad_norm": 0.23640812933444977, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225346048, + "loss": 0.538, + "grad_norm": 0.240090474486351, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225204736, + "loss": 0.5681, + "grad_norm": 0.2946719825267792, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225084928, + "loss": 0.6037, + "grad_norm": 0.25073859095573425, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225312256, + "loss": 0.6278, + "grad_norm": 0.36047831177711487, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22555648, + "loss": 0.488, + "grad_norm": 0.16725294291973114, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225217024, + "loss": 0.6001, + "grad_norm": 0.1796533763408661, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225163264, + "loss": 0.6319, + "grad_norm": 0.26552966237068176, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22532608, + "loss": 0.6041, + "grad_norm": 0.15598487854003906, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225266176, + "loss": 0.65, + "grad_norm": 0.18141354620456696, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225246208, + "loss": 0.6038, + "grad_norm": 0.2367304265499115, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225181696, + "loss": 0.6332, + "grad_norm": 0.2213892638683319, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22561024, + "loss": 0.5804, + "grad_norm": 0.43039804697036743, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225316864, + "loss": 0.5661, + "grad_norm": 0.16909770667552948, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22517248, + "loss": 0.5349, + "grad_norm": 0.17821425199508667, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22522624, + "loss": 0.5902, + "grad_norm": 0.20041631162166595, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225644032, + "loss": 0.5822, + "grad_norm": 0.26122304797172546, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225413632, + "loss": 0.6083, + "grad_norm": 0.34505853056907654, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225198592, + "loss": 0.5914, + "grad_norm": 0.23107720911502838, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225283072, + "loss": 0.589, + "grad_norm": 0.25194406509399414, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225207808, + "loss": 0.6598, + "grad_norm": 0.40302157402038574, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225243136, + "loss": 0.655, + "grad_norm": 0.25559553503990173, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.871491072, + "gpu_mem": 5.22532608, + "loss": 0.5699, + "grad_norm": 0.1895446628332138, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225243136, + "loss": 0.6982, + "grad_norm": 0.3614128530025482, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225269248, + "loss": 0.6327, + "grad_norm": 0.3568006455898285, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.871491072, + "gpu_mem": 5.225269248, + "train_runtime": 8801.4158, + "train_samples_per_second": 2.142, + "train_steps_per_second": 0.033, + "total_flos": 4.923873944056627e+16, + "train_loss": 1.0822927254397854 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5723daa9f5f7b854bf548bbee9a6d37e12198a3a --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 16, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 8, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..723738a870dd86bfb47ec8f2d1af81af94a4f94d --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.6807339449541284 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..7b9df0d9cc6b9251d9bc9e130c59debcaef49a58 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12615680 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-boolq-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2", + "seed": 42, + "timestamp": "2025-09-13T06:58:22.817471" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..ec1c621af3cd98cbe42591df1b9244ef0738bc68 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-boolq-r8-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.844789248, + "gpu_mem": 4.468235264, + "loss": 8.869, + "grad_norm": 3.4864389896392822, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.850884096, + "gpu_mem": 4.569298432, + "loss": 8.9376, + "grad_norm": 3.5296335220336914, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.851670528, + "gpu_mem": 4.569217024, + "loss": 8.9443, + "grad_norm": 3.556117534637451, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.852260352, + "gpu_mem": 4.569217024, + "loss": 8.9055, + "grad_norm": 3.5820488929748535, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.852850176, + "gpu_mem": 4.569152512, + "loss": 8.7468, + "grad_norm": 3.7951903343200684, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.853243392, + "gpu_mem": 4.56917248, + "loss": 8.8842, + "grad_norm": 3.500131607055664, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.853833216, + "gpu_mem": 4.569224704, + "loss": 8.7862, + "grad_norm": 3.7001960277557373, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.85442304, + "gpu_mem": 4.56931072, + "loss": 8.6843, + "grad_norm": 3.8163208961486816, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.854816256, + "gpu_mem": 4.56921856, + "loss": 8.374, + "grad_norm": 3.9458162784576416, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.85540608, + "gpu_mem": 4.56911872, + "loss": 8.3368, + "grad_norm": 3.7611031532287598, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.855799296, + "gpu_mem": 4.569223168, + "loss": 8.289, + "grad_norm": 3.897369623184204, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.856192512, + "gpu_mem": 4.56959488, + "loss": 8.1762, + "grad_norm": 3.7736144065856934, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.856585728, + "gpu_mem": 4.569198592, + "loss": 8.0118, + "grad_norm": 3.76057505607605, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.856978944, + "gpu_mem": 4.569175552, + "loss": 7.6728, + "grad_norm": 4.03656530380249, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.85737216, + "gpu_mem": 4.569114112, + "loss": 7.4054, + "grad_norm": 3.9731688499450684, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.857765376, + "gpu_mem": 4.569198592, + "loss": 7.2846, + "grad_norm": 3.811659336090088, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.858158592, + "gpu_mem": 4.569238528, + "loss": 6.8515, + "grad_norm": 4.381457328796387, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.858551808, + "gpu_mem": 4.569301504, + "loss": 6.9906, + "grad_norm": 4.234468460083008, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.858945024, + "gpu_mem": 4.569138688, + "loss": 6.0908, + "grad_norm": 4.214846611022949, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.859141632, + "gpu_mem": 4.569250816, + "loss": 5.865, + "grad_norm": 4.571763515472412, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.859534848, + "gpu_mem": 4.569409024, + "loss": 5.6187, + "grad_norm": 4.177043437957764, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.859731456, + "gpu_mem": 4.569301504, + "loss": 5.0828, + "grad_norm": 4.575229167938232, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.859928064, + "gpu_mem": 4.569273856, + "loss": 4.6725, + "grad_norm": 3.6655373573303223, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.86032128, + "gpu_mem": 4.569330688, + "loss": 3.7778, + "grad_norm": 4.1521196365356445, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.860714496, + "gpu_mem": 4.569115648, + "loss": 3.4804, + "grad_norm": 3.705185651779175, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.860911104, + "gpu_mem": 4.569170944, + "loss": 2.9155, + "grad_norm": 3.0109832286834717, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.861107712, + "gpu_mem": 4.569462784, + "loss": 2.2909, + "grad_norm": 2.2551920413970947, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.86130432, + "gpu_mem": 4.56914176, + "loss": 2.0139, + "grad_norm": 1.7635722160339355, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.861697536, + "gpu_mem": 4.569206272, + "loss": 1.8352, + "grad_norm": 1.276904821395874, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.861894144, + "gpu_mem": 4.569284608, + "loss": 1.4976, + "grad_norm": 1.067708969116211, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.862090752, + "gpu_mem": 4.569088, + "loss": 1.2895, + "grad_norm": 1.1051933765411377, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.862483968, + "gpu_mem": 4.569201664, + "loss": 1.2167, + "grad_norm": 0.8914623260498047, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.862680576, + "gpu_mem": 4.569439744, + "loss": 0.9549, + "grad_norm": 0.5938089489936829, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.862877184, + "gpu_mem": 4.56914176, + "loss": 0.7617, + "grad_norm": 0.4567009508609772, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.863073792, + "gpu_mem": 4.569352192, + "loss": 0.9054, + "grad_norm": 0.5605969429016113, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.8632704, + "gpu_mem": 4.56930304, + "loss": 0.8204, + "grad_norm": 0.48071518540382385, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.863467008, + "gpu_mem": 4.569114112, + "loss": 0.7449, + "grad_norm": 0.3182627856731415, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.863663616, + "gpu_mem": 4.569361408, + "loss": 0.8315, + "grad_norm": 0.6276575922966003, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.863860224, + "gpu_mem": 4.5697408, + "loss": 0.7002, + "grad_norm": 0.2890534996986389, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.864056832, + "gpu_mem": 4.56931072, + "loss": 0.6799, + "grad_norm": 0.274429589509964, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.86425344, + "gpu_mem": 4.569538048, + "loss": 0.855, + "grad_norm": 0.5877048373222351, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.864450048, + "gpu_mem": 4.569435136, + "loss": 0.7373, + "grad_norm": 0.6851824522018433, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.864646656, + "gpu_mem": 4.56925696, + "loss": 0.7262, + "grad_norm": 0.9600096940994263, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.864843264, + "gpu_mem": 4.569399808, + "loss": 0.6695, + "grad_norm": 0.5769892334938049, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.864843264, + "gpu_mem": 4.56918016, + "loss": 0.7997, + "grad_norm": 0.9360148310661316, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.865039872, + "gpu_mem": 4.569422848, + "loss": 0.6805, + "grad_norm": 0.5352592468261719, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.865039872, + "gpu_mem": 4.569146368, + "loss": 0.6462, + "grad_norm": 0.43625855445861816, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.86523648, + "gpu_mem": 4.569223168, + "loss": 0.6556, + "grad_norm": 0.23391999304294586, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.865433088, + "gpu_mem": 4.569240064, + "loss": 0.8585, + "grad_norm": 0.932235598564148, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.865433088, + "gpu_mem": 4.569178624, + "loss": 0.602, + "grad_norm": 0.300733357667923, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.865433088, + "gpu_mem": 4.569183232, + "loss": 0.6788, + "grad_norm": 0.6813507080078125, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.865629696, + "gpu_mem": 4.569263104, + "loss": 0.6625, + "grad_norm": 0.303780198097229, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.865826304, + "gpu_mem": 4.569286144, + "loss": 0.6608, + "grad_norm": 0.7614128589630127, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.865826304, + "gpu_mem": 4.569213952, + "loss": 0.7232, + "grad_norm": 0.29216963052749634, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.866022912, + "gpu_mem": 4.569484288, + "loss": 0.6526, + "grad_norm": 0.26287776231765747, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.866022912, + "gpu_mem": 4.569270784, + "loss": 0.6677, + "grad_norm": 0.3018553555011749, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.86621952, + "gpu_mem": 4.56926464, + "loss": 0.6152, + "grad_norm": 0.16227291524410248, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.86621952, + "gpu_mem": 4.569160192, + "loss": 0.602, + "grad_norm": 0.2049732655286789, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.866416128, + "gpu_mem": 4.569177088, + "loss": 0.6999, + "grad_norm": 0.530553936958313, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.866612736, + "gpu_mem": 4.569270784, + "loss": 0.6024, + "grad_norm": 0.15197591483592987, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.866612736, + "gpu_mem": 4.569281536, + "loss": 0.619, + "grad_norm": 0.17665980756282806, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.866612736, + "gpu_mem": 4.569269248, + "loss": 0.7138, + "grad_norm": 0.5418952703475952, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.866809344, + "gpu_mem": 4.569261568, + "loss": 0.5947, + "grad_norm": 0.407823383808136, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.866809344, + "gpu_mem": 4.569190912, + "loss": 0.6476, + "grad_norm": 0.3117940127849579, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.867005952, + "gpu_mem": 4.569235456, + "loss": 0.6723, + "grad_norm": 0.18984301388263702, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.867005952, + "gpu_mem": 4.569428992, + "loss": 0.6323, + "grad_norm": 0.5517657399177551, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.867005952, + "gpu_mem": 4.569138688, + "loss": 0.6329, + "grad_norm": 0.32155558466911316, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.867005952, + "gpu_mem": 4.569106432, + "loss": 0.7257, + "grad_norm": 0.5993584394454956, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.867005952, + "gpu_mem": 4.56917248, + "loss": 0.6649, + "grad_norm": 0.36204221844673157, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.86720256, + "gpu_mem": 4.569166336, + "loss": 0.6801, + "grad_norm": 0.3007507026195526, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.86720256, + "gpu_mem": 4.5693952, + "loss": 0.594, + "grad_norm": 0.2737438976764679, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.86720256, + "gpu_mem": 4.56938752, + "loss": 0.6695, + "grad_norm": 0.20364800095558167, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.867399168, + "gpu_mem": 4.569353728, + "loss": 0.7123, + "grad_norm": 0.5893431305885315, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.867399168, + "gpu_mem": 4.569213952, + "loss": 0.6062, + "grad_norm": 0.29233109951019287, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.867595776, + "gpu_mem": 4.569138688, + "loss": 0.6028, + "grad_norm": 0.6165870428085327, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.867595776, + "gpu_mem": 4.569078784, + "loss": 0.6498, + "grad_norm": 0.4225340485572815, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.867595776, + "gpu_mem": 4.569152512, + "loss": 0.6043, + "grad_norm": 0.45868176221847534, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.867595776, + "gpu_mem": 4.569204736, + "loss": 0.7725, + "grad_norm": 0.8883103728294373, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.867595776, + "gpu_mem": 4.569336832, + "loss": 0.6641, + "grad_norm": 0.272790789604187, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.867595776, + "gpu_mem": 4.569227776, + "loss": 0.6369, + "grad_norm": 0.16785593330860138, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.867792384, + "gpu_mem": 4.569107968, + "loss": 0.6504, + "grad_norm": 0.2761949896812439, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.867792384, + "gpu_mem": 4.569177088, + "loss": 0.6781, + "grad_norm": 0.2840648591518402, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.867792384, + "gpu_mem": 4.569276928, + "loss": 0.6974, + "grad_norm": 0.333095908164978, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.867792384, + "gpu_mem": 4.569240064, + "loss": 0.6338, + "grad_norm": 0.42851707339286804, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.867792384, + "gpu_mem": 4.56927232, + "loss": 0.6254, + "grad_norm": 0.5778331160545349, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.867792384, + "gpu_mem": 4.569223168, + "loss": 0.636, + "grad_norm": 0.44489744305610657, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.867792384, + "gpu_mem": 4.569230848, + "loss": 0.6438, + "grad_norm": 0.15058523416519165, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.867792384, + "gpu_mem": 4.569375232, + "loss": 0.6191, + "grad_norm": 0.2644456923007965, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.867792384, + "gpu_mem": 4.56915712, + "loss": 0.6407, + "grad_norm": 0.4522266983985901, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.867988992, + "gpu_mem": 4.56921088, + "loss": 0.6785, + "grad_norm": 0.40537896752357483, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.867988992, + "gpu_mem": 4.569178624, + "loss": 0.5951, + "grad_norm": 0.15953081846237183, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569260032, + "loss": 0.7326, + "grad_norm": 0.7195523977279663, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569063424, + "loss": 0.7221, + "grad_norm": 0.37060847878456116, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569177088, + "loss": 0.6141, + "grad_norm": 0.448542058467865, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569197056, + "loss": 0.6702, + "grad_norm": 0.2975255846977234, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569235456, + "loss": 0.6303, + "grad_norm": 0.38070985674858093, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569220096, + "loss": 0.6651, + "grad_norm": 0.2246520221233368, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569132544, + "loss": 0.6517, + "grad_norm": 0.17215245962142944, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569081856, + "loss": 0.6429, + "grad_norm": 0.16858263313770294, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569198592, + "loss": 0.6192, + "grad_norm": 0.26492831110954285, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569230848, + "loss": 0.6653, + "grad_norm": 0.18192459642887115, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.8681856, + "gpu_mem": 4.56926464, + "loss": 0.6374, + "grad_norm": 0.17392142117023468, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569315328, + "loss": 0.6655, + "grad_norm": 0.212745800614357, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569220096, + "loss": 0.6323, + "grad_norm": 0.3657839000225067, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569321472, + "loss": 0.6611, + "grad_norm": 0.1585790365934372, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.8681856, + "gpu_mem": 4.56927232, + "loss": 0.6102, + "grad_norm": 0.2500617504119873, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569160192, + "loss": 0.568, + "grad_norm": 0.22091622650623322, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569344512, + "loss": 0.6218, + "grad_norm": 0.2417532503604889, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569198592, + "loss": 0.6461, + "grad_norm": 0.21034584939479828, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569201664, + "loss": 0.6216, + "grad_norm": 0.16161663830280304, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569170944, + "loss": 0.6476, + "grad_norm": 0.31787025928497314, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569217024, + "loss": 0.6392, + "grad_norm": 0.15448404848575592, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569207808, + "loss": 0.642, + "grad_norm": 0.15988048911094666, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.8681856, + "gpu_mem": 4.569189376, + "loss": 0.583, + "grad_norm": 0.24285565316677094, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.868382208, + "gpu_mem": 4.56926464, + "loss": 0.5866, + "grad_norm": 0.18814919888973236, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569184768, + "loss": 0.6334, + "grad_norm": 0.2691470980644226, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569075712, + "loss": 0.6342, + "grad_norm": 0.18082375824451447, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569313792, + "loss": 0.5947, + "grad_norm": 0.27322983741760254, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569484288, + "loss": 0.5978, + "grad_norm": 0.18081514537334442, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569217024, + "loss": 0.5756, + "grad_norm": 0.24086304008960724, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569244672, + "loss": 0.5914, + "grad_norm": 0.19171011447906494, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.868382208, + "gpu_mem": 4.56929536, + "loss": 0.5737, + "grad_norm": 0.1919744908809662, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569104896, + "loss": 0.6124, + "grad_norm": 0.4881593883037567, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569547264, + "loss": 0.5855, + "grad_norm": 0.32220786809921265, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569273856, + "loss": 0.5547, + "grad_norm": 0.13969209790229797, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.868382208, + "gpu_mem": 4.56915712, + "loss": 0.5766, + "grad_norm": 0.2447868436574936, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569596416, + "loss": 0.6184, + "grad_norm": 0.3503088653087616, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.868382208, + "gpu_mem": 4.56937216, + "loss": 0.5191, + "grad_norm": 0.33297279477119446, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569412096, + "loss": 0.6663, + "grad_norm": 0.32061004638671875, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569193984, + "loss": 0.6925, + "grad_norm": 0.14841172099113464, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569323008, + "loss": 0.6377, + "grad_norm": 0.16201747953891754, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.868382208, + "gpu_mem": 4.569404416, + "loss": 0.5764, + "grad_norm": 0.35174643993377686, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.868578816, + "gpu_mem": 4.56918784, + "loss": 0.5874, + "grad_norm": 0.34143969416618347, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569321472, + "loss": 0.577, + "grad_norm": 0.7061131596565247, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569344512, + "loss": 0.5563, + "grad_norm": 0.18467654287815094, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569181696, + "loss": 0.499, + "grad_norm": 0.25841766595840454, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569061888, + "loss": 0.6562, + "grad_norm": 0.6774580478668213, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569243136, + "loss": 0.6027, + "grad_norm": 0.6688999533653259, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.868578816, + "gpu_mem": 4.56914176, + "loss": 0.6442, + "grad_norm": 0.5563299655914307, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569193984, + "loss": 0.578, + "grad_norm": 0.3428448736667633, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.868578816, + "gpu_mem": 4.56922624, + "loss": 0.5996, + "grad_norm": 0.23625700175762177, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569366016, + "loss": 0.5019, + "grad_norm": 0.4002193808555603, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.868578816, + "gpu_mem": 4.56934912, + "loss": 0.6905, + "grad_norm": 0.23743273317813873, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.868578816, + "gpu_mem": 4.56954112, + "loss": 0.5939, + "grad_norm": 0.3022172749042511, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569252352, + "loss": 0.6195, + "grad_norm": 0.5608668327331543, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.868578816, + "gpu_mem": 4.56928768, + "loss": 0.5852, + "grad_norm": 0.17279960215091705, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.868578816, + "gpu_mem": 4.569186304, + "loss": 0.6073, + "grad_norm": 0.30650338530540466, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619813376, + "loss": 0.9718, + "grad_norm": 0.7434167265892029, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619748864, + "loss": 0.5927, + "grad_norm": 0.23540715873241425, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619586048, + "loss": 0.623, + "grad_norm": 0.3019307553768158, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.868578816, + "gpu_mem": 4.61965824, + "loss": 0.7438, + "grad_norm": 0.40375852584838867, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619693568, + "loss": 0.5371, + "grad_norm": 0.303040087223053, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619718144, + "loss": 0.6261, + "grad_norm": 0.3840215802192688, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619679744, + "loss": 0.5695, + "grad_norm": 0.15683363378047943, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619900928, + "loss": 0.5837, + "grad_norm": 0.27359122037887573, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619808768, + "loss": 0.6318, + "grad_norm": 0.6406038999557495, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619715072, + "loss": 0.6114, + "grad_norm": 0.23893854022026062, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619641344, + "loss": 0.571, + "grad_norm": 0.19982002675533295, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619990016, + "loss": 0.5366, + "grad_norm": 0.21643070876598358, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619584512, + "loss": 0.6489, + "grad_norm": 0.2626379132270813, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619530752, + "loss": 0.563, + "grad_norm": 0.24028165638446808, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.868578816, + "gpu_mem": 4.620306432, + "loss": 0.5502, + "grad_norm": 0.20449261367321014, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619782656, + "loss": 0.6463, + "grad_norm": 0.4013831913471222, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619695104, + "loss": 0.6441, + "grad_norm": 0.2570289373397827, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619644416, + "loss": 0.6276, + "grad_norm": 0.505293607711792, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619739648, + "loss": 0.5858, + "grad_norm": 0.4458205997943878, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619656704, + "loss": 0.6358, + "grad_norm": 0.40672361850738525, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619675136, + "loss": 0.543, + "grad_norm": 0.215139701962471, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619762688, + "loss": 0.5558, + "grad_norm": 0.3583396375179291, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.868578816, + "gpu_mem": 4.619647488, + "loss": 0.5457, + "grad_norm": 0.20694230496883392, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619710464, + "loss": 0.6018, + "grad_norm": 0.20557917654514313, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619618304, + "loss": 0.6049, + "grad_norm": 0.16779173910617828, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619917824, + "loss": 0.5489, + "grad_norm": 0.2219359427690506, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619641344, + "loss": 0.6814, + "grad_norm": 0.3426586389541626, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619607552, + "loss": 0.5968, + "grad_norm": 0.20247220993041992, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619745792, + "loss": 0.5592, + "grad_norm": 0.4305143654346466, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619844096, + "loss": 0.5609, + "grad_norm": 0.27157676219940186, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619590656, + "loss": 0.5699, + "grad_norm": 0.33713287115097046, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619690496, + "loss": 0.6097, + "grad_norm": 0.552636444568634, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619662848, + "loss": 0.6353, + "grad_norm": 0.1929439902305603, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619599872, + "loss": 0.5461, + "grad_norm": 0.21299293637275696, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619817984, + "loss": 0.6515, + "grad_norm": 0.6366555690765381, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619715072, + "loss": 0.61, + "grad_norm": 0.22585715353488922, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619662848, + "loss": 0.5414, + "grad_norm": 0.19121809303760529, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619641344, + "loss": 0.5139, + "grad_norm": 0.2095835655927658, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61965056, + "loss": 0.5044, + "grad_norm": 0.3684116303920746, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619582976, + "loss": 0.6399, + "grad_norm": 0.38584113121032715, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619745792, + "loss": 0.5968, + "grad_norm": 0.3719564378261566, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619615232, + "loss": 0.6727, + "grad_norm": 0.26370057463645935, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61973504, + "loss": 0.562, + "grad_norm": 0.21919971704483032, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619553792, + "loss": 0.5599, + "grad_norm": 0.2167692631483078, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619685888, + "loss": 0.542, + "grad_norm": 0.6977086663246155, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619659776, + "loss": 0.5724, + "grad_norm": 0.20705866813659668, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619625984, + "loss": 0.6231, + "grad_norm": 0.2529354691505432, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619730432, + "loss": 0.5275, + "grad_norm": 0.19715985655784607, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619725824, + "loss": 0.522, + "grad_norm": 0.23550023138523102, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619584512, + "loss": 0.5592, + "grad_norm": 0.44015738368034363, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619776512, + "loss": 0.5552, + "grad_norm": 0.33142605423927307, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61962752, + "loss": 0.5802, + "grad_norm": 0.33144035935401917, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619730432, + "loss": 0.5913, + "grad_norm": 0.27675193548202515, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619933184, + "loss": 0.5726, + "grad_norm": 0.23826994001865387, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61973504, + "loss": 0.5471, + "grad_norm": 0.24110861122608185, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619621376, + "loss": 0.5154, + "grad_norm": 0.3283371329307556, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.869955072, + "gpu_mem": 4.6196352, + "loss": 0.5857, + "grad_norm": 0.3745998442173004, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619679744, + "loss": 0.6002, + "grad_norm": 0.4208592474460602, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619621376, + "loss": 0.5675, + "grad_norm": 0.22009742259979248, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619854848, + "loss": 0.5768, + "grad_norm": 0.36210229992866516, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619885568, + "loss": 0.5194, + "grad_norm": 0.5253971219062805, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61981184, + "loss": 0.6625, + "grad_norm": 0.2661760449409485, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619699712, + "loss": 0.5725, + "grad_norm": 0.26974421739578247, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619649024, + "loss": 0.5044, + "grad_norm": 0.4165514409542084, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619618304, + "loss": 0.6335, + "grad_norm": 0.3144778907299042, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619641344, + "loss": 0.5922, + "grad_norm": 0.33758872747421265, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619724288, + "loss": 0.4803, + "grad_norm": 0.25521180033683777, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619652096, + "loss": 0.637, + "grad_norm": 0.6204141974449158, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619817984, + "loss": 0.5391, + "grad_norm": 0.2896119952201843, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619659776, + "loss": 0.6725, + "grad_norm": 0.6467360854148865, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619636736, + "loss": 0.5311, + "grad_norm": 0.2315453290939331, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619762688, + "loss": 0.5925, + "grad_norm": 0.26674365997314453, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61979648, + "loss": 0.5732, + "grad_norm": 0.34899473190307617, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619664384, + "loss": 0.6299, + "grad_norm": 0.31713443994522095, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619801088, + "loss": 0.5361, + "grad_norm": 0.31083235144615173, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619715072, + "loss": 0.563, + "grad_norm": 0.36497583985328674, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619679744, + "loss": 0.5678, + "grad_norm": 0.23361752927303314, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619644416, + "loss": 0.5817, + "grad_norm": 0.1971440315246582, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619793408, + "loss": 0.5413, + "grad_norm": 0.4486442506313324, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619682816, + "loss": 0.5297, + "grad_norm": 0.20069724321365356, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61962752, + "loss": 0.6441, + "grad_norm": 0.28323158621788025, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619569152, + "loss": 0.5394, + "grad_norm": 0.2116287648677826, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619625984, + "loss": 0.5674, + "grad_norm": 0.2197268009185791, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619904, + "loss": 0.6033, + "grad_norm": 0.23146265745162964, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61962752, + "loss": 0.6502, + "grad_norm": 0.2810196280479431, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619940864, + "loss": 0.5418, + "grad_norm": 0.25268030166625977, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619816448, + "loss": 0.5725, + "grad_norm": 0.3863709270954132, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619572224, + "loss": 0.5238, + "grad_norm": 0.4046196937561035, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619632128, + "loss": 0.5906, + "grad_norm": 0.31145113706588745, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619693568, + "loss": 0.5476, + "grad_norm": 0.3644321858882904, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619695104, + "loss": 0.5746, + "grad_norm": 0.3033443093299866, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619948544, + "loss": 0.6773, + "grad_norm": 0.3245789408683777, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619598336, + "loss": 0.6536, + "grad_norm": 0.22184336185455322, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619894784, + "loss": 0.5909, + "grad_norm": 0.33780184388160706, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619756544, + "loss": 0.5751, + "grad_norm": 0.25562813878059387, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619609088, + "loss": 0.682, + "grad_norm": 0.47343429923057556, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619748864, + "loss": 0.5392, + "grad_norm": 0.326248437166214, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61962752, + "loss": 0.5775, + "grad_norm": 0.2188483327627182, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619721216, + "loss": 0.5632, + "grad_norm": 0.27142593264579773, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619739648, + "loss": 0.5391, + "grad_norm": 0.2119600921869278, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619692032, + "loss": 0.5703, + "grad_norm": 0.2127329707145691, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619598336, + "loss": 0.5046, + "grad_norm": 0.2819758653640747, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619690496, + "loss": 0.5471, + "grad_norm": 0.38040268421173096, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619602944, + "loss": 0.5509, + "grad_norm": 0.26347485184669495, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619645952, + "loss": 0.6369, + "grad_norm": 0.27009615302085876, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619839488, + "loss": 0.5872, + "grad_norm": 0.21669737994670868, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.869955072, + "gpu_mem": 4.6196352, + "loss": 0.6057, + "grad_norm": 0.3231895864009857, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619791872, + "loss": 0.6172, + "grad_norm": 0.524515688419342, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619618304, + "loss": 0.5597, + "grad_norm": 0.30464836955070496, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.869955072, + "gpu_mem": 4.620031488, + "loss": 0.5818, + "grad_norm": 0.3819347620010376, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619690496, + "loss": 0.59, + "grad_norm": 0.46726247668266296, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619607552, + "loss": 0.5646, + "grad_norm": 0.21613921225070953, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619724288, + "loss": 0.4709, + "grad_norm": 0.20782452821731567, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619679744, + "loss": 0.4803, + "grad_norm": 0.3153877556324005, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619636736, + "loss": 0.5302, + "grad_norm": 0.2309408187866211, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.869955072, + "gpu_mem": 4.6196736, + "loss": 0.5507, + "grad_norm": 0.22791320085525513, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619761152, + "loss": 0.521, + "grad_norm": 0.3149639368057251, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619679744, + "loss": 0.5747, + "grad_norm": 0.2614198327064514, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619894784, + "loss": 0.556, + "grad_norm": 0.2069764882326126, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619687424, + "loss": 0.5584, + "grad_norm": 0.5513150691986084, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619692032, + "loss": 0.4987, + "grad_norm": 0.33310428261756897, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619702784, + "loss": 0.5402, + "grad_norm": 0.40690258145332336, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619741184, + "loss": 0.6208, + "grad_norm": 0.29315072298049927, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619793408, + "loss": 0.5545, + "grad_norm": 0.25929155945777893, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619652096, + "loss": 0.548, + "grad_norm": 0.3184114694595337, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619532288, + "loss": 0.6291, + "grad_norm": 0.35154369473457336, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619759616, + "loss": 0.6034, + "grad_norm": 0.32993513345718384, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.869955072, + "gpu_mem": 4.62000384, + "loss": 0.5331, + "grad_norm": 0.20142915844917297, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619664384, + "loss": 0.5774, + "grad_norm": 0.20837923884391785, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619610624, + "loss": 0.6398, + "grad_norm": 0.3170711398124695, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61977344, + "loss": 0.5956, + "grad_norm": 0.20124398171901703, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619713536, + "loss": 0.6597, + "grad_norm": 0.2521730661392212, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619693568, + "loss": 0.6055, + "grad_norm": 0.27245184779167175, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619629056, + "loss": 0.5983, + "grad_norm": 0.21393615007400513, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.869955072, + "gpu_mem": 4.6200576, + "loss": 0.5938, + "grad_norm": 0.4789579212665558, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619764224, + "loss": 0.5506, + "grad_norm": 0.24231503903865814, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61961984, + "loss": 0.5613, + "grad_norm": 0.22894299030303955, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.869955072, + "gpu_mem": 4.6196736, + "loss": 0.5672, + "grad_norm": 0.24269582331180573, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.869955072, + "gpu_mem": 4.620091392, + "loss": 0.5801, + "grad_norm": 0.3782336115837097, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619860992, + "loss": 0.5711, + "grad_norm": 0.40622860193252563, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619645952, + "loss": 0.6113, + "grad_norm": 0.330731600522995, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619730432, + "loss": 0.6043, + "grad_norm": 0.29115578532218933, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619655168, + "loss": 0.6569, + "grad_norm": 0.5765615105628967, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619690496, + "loss": 0.644, + "grad_norm": 0.3383577764034271, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.869955072, + "gpu_mem": 4.61977344, + "loss": 0.561, + "grad_norm": 0.2158483862876892, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619690496, + "loss": 0.7093, + "grad_norm": 0.41401755809783936, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619716608, + "loss": 0.6199, + "grad_norm": 0.4071233868598938, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.869955072, + "gpu_mem": 4.619716608, + "train_runtime": 8792.1334, + "train_samples_per_second": 2.144, + "train_steps_per_second": 0.033, + "total_flos": 4.752115114357555e+16, + "train_loss": 1.2131220978133532 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4758d99093e963e7b960b3e04b3ff68f0cc5fe --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 4, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 2, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8f1608b00415c0bb861fc0a99db8bd070f47ec59 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.26289583748257317 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..9f72e49210e288636443d0b1a8d4a886b9a303e1 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3153920 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-hellaswag-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2", + "seed": 42, + "timestamp": "2025-09-13T02:07:44.831528" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..012f0dfabc65c98a2f08e785c90c3efa418d30a2 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r2-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.84311808, + "gpu_mem": 4.4303744, + "loss": 3.4877, + "grad_norm": 2.5128583908081055, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.849212928, + "gpu_mem": 4.455599104, + "loss": 3.6203, + "grad_norm": 2.4821343421936035, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.850392576, + "gpu_mem": 4.455606784, + "loss": 3.4328, + "grad_norm": 2.461855411529541, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.851572224, + "gpu_mem": 4.455640576, + "loss": 3.6281, + "grad_norm": 2.4223098754882812, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.852555264, + "gpu_mem": 4.455603712, + "loss": 3.5456, + "grad_norm": 2.503544569015503, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.853538304, + "gpu_mem": 4.455649792, + "loss": 3.6669, + "grad_norm": 2.6511034965515137, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.854324736, + "gpu_mem": 4.455609856, + "loss": 3.6512, + "grad_norm": 2.434396982192993, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.855111168, + "gpu_mem": 4.455640576, + "loss": 3.4234, + "grad_norm": 2.5581724643707275, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.856094208, + "gpu_mem": 4.455640576, + "loss": 3.388, + "grad_norm": 2.4799861907958984, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.85688064, + "gpu_mem": 4.455583744, + "loss": 3.2758, + "grad_norm": 2.50430965423584, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.857667072, + "gpu_mem": 4.455603712, + "loss": 3.3184, + "grad_norm": 2.7605438232421875, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.858256896, + "gpu_mem": 4.45560064, + "loss": 3.625, + "grad_norm": 2.8089077472686768, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.859043328, + "gpu_mem": 4.45559296, + "loss": 3.4095, + "grad_norm": 2.627682685852051, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.85982976, + "gpu_mem": 4.455619072, + "loss": 3.3473, + "grad_norm": 2.7129805088043213, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.860616192, + "gpu_mem": 4.455617536, + "loss": 3.1893, + "grad_norm": 2.672372341156006, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.861206016, + "gpu_mem": 4.455609856, + "loss": 3.326, + "grad_norm": 2.5466272830963135, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.861992448, + "gpu_mem": 4.455609856, + "loss": 3.1212, + "grad_norm": 2.536281108856201, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.862582272, + "gpu_mem": 4.455609856, + "loss": 3.1416, + "grad_norm": 2.3969826698303223, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.863368704, + "gpu_mem": 4.455609856, + "loss": 3.1987, + "grad_norm": 2.4620909690856934, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.863958528, + "gpu_mem": 4.455583744, + "loss": 3.1199, + "grad_norm": 2.638507843017578, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.864548352, + "gpu_mem": 4.45560064, + "loss": 3.1528, + "grad_norm": 2.291907548904419, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.865334784, + "gpu_mem": 4.45560832, + "loss": 2.772, + "grad_norm": 2.164724349975586, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.865924608, + "gpu_mem": 4.455622144, + "loss": 2.9836, + "grad_norm": 2.597036361694336, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.866514432, + "gpu_mem": 4.455606784, + "loss": 3.1074, + "grad_norm": 2.6924314498901367, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.867104256, + "gpu_mem": 4.455594496, + "loss": 2.5904, + "grad_norm": 2.1538643836975098, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.86769408, + "gpu_mem": 4.45560064, + "loss": 2.3789, + "grad_norm": 1.869153380393982, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.868283904, + "gpu_mem": 4.45560832, + "loss": 2.4549, + "grad_norm": 1.9755829572677612, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.868873728, + "gpu_mem": 4.455603712, + "loss": 2.3328, + "grad_norm": 1.7507712841033936, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.870839808, + "gpu_mem": 4.455612928, + "loss": 2.4478, + "grad_norm": 2.1020007133483887, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.871429632, + "gpu_mem": 4.45558528, + "loss": 2.3423, + "grad_norm": 1.587646245956421, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.872019456, + "gpu_mem": 4.455640576, + "loss": 2.2072, + "grad_norm": 1.6829043626785278, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.87260928, + "gpu_mem": 4.455632896, + "loss": 2.303, + "grad_norm": 1.692579746246338, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.873199104, + "gpu_mem": 4.455586816, + "loss": 2.0239, + "grad_norm": 1.2231611013412476, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.873788928, + "gpu_mem": 4.455605248, + "loss": 1.9875, + "grad_norm": 1.1457680463790894, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.874378752, + "gpu_mem": 4.455626752, + "loss": 1.718, + "grad_norm": 0.9726106524467468, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.874968576, + "gpu_mem": 4.455625216, + "loss": 1.8217, + "grad_norm": 1.0548276901245117, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.875361792, + "gpu_mem": 4.455657472, + "loss": 1.79, + "grad_norm": 0.9067928791046143, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.875951616, + "gpu_mem": 4.455609856, + "loss": 1.7483, + "grad_norm": 0.7876018285751343, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.876738048, + "gpu_mem": 4.455666688, + "loss": 1.5296, + "grad_norm": 0.5893408060073853, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.877131264, + "gpu_mem": 4.455594496, + "loss": 1.7067, + "grad_norm": 0.6502890586853027, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.877721088, + "gpu_mem": 4.455622144, + "loss": 1.585, + "grad_norm": 0.4173852503299713, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.878310912, + "gpu_mem": 4.455635968, + "loss": 1.4991, + "grad_norm": 0.41396141052246094, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.878900736, + "gpu_mem": 4.455642112, + "loss": 1.497, + "grad_norm": 0.25634443759918213, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.87949056, + "gpu_mem": 4.455620608, + "loss": 1.5, + "grad_norm": 0.2795088291168213, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.879883776, + "gpu_mem": 4.455620608, + "loss": 1.4608, + "grad_norm": 0.23724018037319183, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.8804736, + "gpu_mem": 4.455620608, + "loss": 1.459, + "grad_norm": 0.35558125376701355, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.881063424, + "gpu_mem": 4.455606784, + "loss": 1.4326, + "grad_norm": 0.20985399186611176, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.881653248, + "gpu_mem": 4.455625216, + "loss": 1.4267, + "grad_norm": 0.26539531350135803, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.882046464, + "gpu_mem": 4.455637504, + "loss": 1.4186, + "grad_norm": 0.3287004828453064, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.882636288, + "gpu_mem": 4.455614464, + "loss": 1.3882, + "grad_norm": 0.2930854558944702, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.883029504, + "gpu_mem": 4.455599104, + "loss": 1.4024, + "grad_norm": 0.2226148247718811, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.883619328, + "gpu_mem": 4.455603712, + "loss": 1.3923, + "grad_norm": 0.2812391221523285, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.884209152, + "gpu_mem": 4.45563136, + "loss": 1.4397, + "grad_norm": 0.390704870223999, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.884602368, + "gpu_mem": 4.455606784, + "loss": 1.4049, + "grad_norm": 0.3151853084564209, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.884995584, + "gpu_mem": 4.455625216, + "loss": 1.444, + "grad_norm": 0.3927886188030243, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.8853888, + "gpu_mem": 4.455619072, + "loss": 1.4267, + "grad_norm": 0.3127141296863556, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.885782016, + "gpu_mem": 4.45558528, + "loss": 1.4141, + "grad_norm": 0.3081414997577667, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.88637184, + "gpu_mem": 4.455614464, + "loss": 1.4076, + "grad_norm": 0.2593015134334564, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.886765056, + "gpu_mem": 4.455597568, + "loss": 1.4038, + "grad_norm": 0.3540239930152893, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.88735488, + "gpu_mem": 4.45563904, + "loss": 1.3933, + "grad_norm": 0.3802341818809509, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.887748096, + "gpu_mem": 4.455605248, + "loss": 1.3804, + "grad_norm": 0.15232135355472565, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.88833792, + "gpu_mem": 4.455645184, + "loss": 1.365, + "grad_norm": 0.3793972134590149, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.888927744, + "gpu_mem": 4.455599104, + "loss": 1.4389, + "grad_norm": 0.2426760047674179, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.88932096, + "gpu_mem": 4.455603712, + "loss": 1.4373, + "grad_norm": 0.26277458667755127, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.889714176, + "gpu_mem": 4.45560064, + "loss": 1.4432, + "grad_norm": 0.2748221158981323, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.890107392, + "gpu_mem": 4.455619072, + "loss": 1.4501, + "grad_norm": 0.22935280203819275, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.890697216, + "gpu_mem": 4.455611392, + "loss": 1.3982, + "grad_norm": 0.15509389340877533, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.89128704, + "gpu_mem": 4.455596032, + "loss": 1.3831, + "grad_norm": 0.205854594707489, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.891680256, + "gpu_mem": 4.455666688, + "loss": 1.4352, + "grad_norm": 0.3528335690498352, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.892073472, + "gpu_mem": 4.455617536, + "loss": 1.4332, + "grad_norm": 0.3484487533569336, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.893449728, + "gpu_mem": 4.455642112, + "loss": 1.4475, + "grad_norm": 0.4785464406013489, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.894039552, + "gpu_mem": 4.455612928, + "loss": 1.3609, + "grad_norm": 0.13429881632328033, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.894432768, + "gpu_mem": 4.455605248, + "loss": 1.4097, + "grad_norm": 0.30282288789749146, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.894825984, + "gpu_mem": 4.455599104, + "loss": 1.3925, + "grad_norm": 0.17945511639118195, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.8952192, + "gpu_mem": 4.455628288, + "loss": 1.3963, + "grad_norm": 0.213457390666008, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.895612416, + "gpu_mem": 4.455619072, + "loss": 1.4391, + "grad_norm": 0.48411449790000916, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.896005632, + "gpu_mem": 4.455606784, + "loss": 1.4403, + "grad_norm": 0.33211350440979004, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.896398848, + "gpu_mem": 4.455599104, + "loss": 1.3593, + "grad_norm": 0.20841778814792633, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.896792064, + "gpu_mem": 4.455651328, + "loss": 1.4055, + "grad_norm": 0.24423983693122864, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.89718528, + "gpu_mem": 4.455629824, + "loss": 1.4136, + "grad_norm": 0.17654864490032196, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.897578496, + "gpu_mem": 4.45562368, + "loss": 1.3801, + "grad_norm": 0.2682344913482666, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.897971712, + "gpu_mem": 4.45560064, + "loss": 1.3873, + "grad_norm": 0.1930839866399765, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.898364928, + "gpu_mem": 4.455622144, + "loss": 1.4133, + "grad_norm": 0.3096357583999634, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.898758144, + "gpu_mem": 4.455594496, + "loss": 1.4198, + "grad_norm": 0.271965354681015, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.89915136, + "gpu_mem": 4.455602176, + "loss": 1.401, + "grad_norm": 0.18421366810798645, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.899544576, + "gpu_mem": 4.455620608, + "loss": 1.3766, + "grad_norm": 0.13072967529296875, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.899937792, + "gpu_mem": 4.455609856, + "loss": 1.4063, + "grad_norm": 0.25514790415763855, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.900331008, + "gpu_mem": 4.45560832, + "loss": 1.4173, + "grad_norm": 0.3596479892730713, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.900724224, + "gpu_mem": 4.455603712, + "loss": 1.3716, + "grad_norm": 0.2189432829618454, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.90111744, + "gpu_mem": 4.45560832, + "loss": 1.3985, + "grad_norm": 0.2754857540130615, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.901510656, + "gpu_mem": 4.455619072, + "loss": 1.3748, + "grad_norm": 0.12790535390377045, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.901903872, + "gpu_mem": 4.455622144, + "loss": 1.4028, + "grad_norm": 0.16529704630374908, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.902297088, + "gpu_mem": 4.455622144, + "loss": 1.4218, + "grad_norm": 0.20160584151744843, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.902690304, + "gpu_mem": 4.455617536, + "loss": 1.4154, + "grad_norm": 0.2022165060043335, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.90308352, + "gpu_mem": 4.455635968, + "loss": 1.3729, + "grad_norm": 0.2602253556251526, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.903476736, + "gpu_mem": 4.45563904, + "loss": 1.4003, + "grad_norm": 0.13144388794898987, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.903869952, + "gpu_mem": 4.455616, + "loss": 1.4014, + "grad_norm": 0.15986980497837067, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.904263168, + "gpu_mem": 4.455626752, + "loss": 1.4009, + "grad_norm": 0.11606542766094208, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.904656384, + "gpu_mem": 4.455626752, + "loss": 1.391, + "grad_norm": 0.2365250587463379, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.9050496, + "gpu_mem": 4.455602176, + "loss": 1.3984, + "grad_norm": 0.15977653861045837, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.905442816, + "gpu_mem": 4.45563136, + "loss": 1.4067, + "grad_norm": 0.2774398624897003, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.905836032, + "gpu_mem": 4.45560832, + "loss": 1.3707, + "grad_norm": 0.2397141456604004, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.906229248, + "gpu_mem": 4.455625216, + "loss": 1.394, + "grad_norm": 0.20532380044460297, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.906425856, + "gpu_mem": 4.45559296, + "loss": 1.3797, + "grad_norm": 0.20464231073856354, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.906819072, + "gpu_mem": 4.45560832, + "loss": 1.3978, + "grad_norm": 0.22663050889968872, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.90701568, + "gpu_mem": 4.455588352, + "loss": 1.4016, + "grad_norm": 0.1968384087085724, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.907212288, + "gpu_mem": 4.455629824, + "loss": 1.3725, + "grad_norm": 0.10404251515865326, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.907605504, + "gpu_mem": 4.455625216, + "loss": 1.3979, + "grad_norm": 0.1685674637556076, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.90799872, + "gpu_mem": 4.45563136, + "loss": 1.3746, + "grad_norm": 0.18063496053218842, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.908391936, + "gpu_mem": 4.455628288, + "loss": 1.3892, + "grad_norm": 0.15602722764015198, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.908785152, + "gpu_mem": 4.455629824, + "loss": 1.3704, + "grad_norm": 0.1818743348121643, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.909178368, + "gpu_mem": 4.455626752, + "loss": 1.3958, + "grad_norm": 0.1893455684185028, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.909374976, + "gpu_mem": 4.455606784, + "loss": 1.408, + "grad_norm": 0.13489145040512085, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.909768192, + "gpu_mem": 4.455602176, + "loss": 1.4058, + "grad_norm": 0.26649245619773865, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.9099648, + "gpu_mem": 4.455620608, + "loss": 1.4015, + "grad_norm": 0.11739127337932587, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.910358016, + "gpu_mem": 4.45563136, + "loss": 1.3959, + "grad_norm": 0.1669812798500061, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.910751232, + "gpu_mem": 4.455617536, + "loss": 1.3963, + "grad_norm": 0.11700169742107391, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.91094784, + "gpu_mem": 4.455632896, + "loss": 1.404, + "grad_norm": 0.23675836622714996, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.911341056, + "gpu_mem": 4.455614464, + "loss": 1.374, + "grad_norm": 0.24805021286010742, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.911734272, + "gpu_mem": 4.455640576, + "loss": 1.3953, + "grad_norm": 0.19911380112171173, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.912127488, + "gpu_mem": 4.455599104, + "loss": 1.4056, + "grad_norm": 0.18831779062747955, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.912324096, + "gpu_mem": 4.45563136, + "loss": 1.3884, + "grad_norm": 0.1480238139629364, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.912717312, + "gpu_mem": 4.455625216, + "loss": 1.386, + "grad_norm": 0.10932669788599014, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.913110528, + "gpu_mem": 4.455626752, + "loss": 1.3845, + "grad_norm": 0.17890113592147827, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.913503744, + "gpu_mem": 4.455602176, + "loss": 1.3571, + "grad_norm": 0.15868866443634033, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.913700352, + "gpu_mem": 4.455611392, + "loss": 1.3519, + "grad_norm": 0.18742027878761292, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.914093568, + "gpu_mem": 4.455597568, + "loss": 1.4204, + "grad_norm": 0.2918684780597687, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.914486784, + "gpu_mem": 4.455634432, + "loss": 1.3693, + "grad_norm": 0.16720642149448395, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.914683392, + "gpu_mem": 4.45563136, + "loss": 1.3882, + "grad_norm": 0.10875742882490158, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.915076608, + "gpu_mem": 4.45563136, + "loss": 1.4227, + "grad_norm": 0.2143721580505371, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.915469824, + "gpu_mem": 4.455620608, + "loss": 1.4093, + "grad_norm": 0.1833495795726776, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.915666432, + "gpu_mem": 4.455620608, + "loss": 1.3512, + "grad_norm": 0.17044134438037872, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.916059648, + "gpu_mem": 4.455602176, + "loss": 1.3863, + "grad_norm": 0.15147517621517181, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.916256256, + "gpu_mem": 4.455612928, + "loss": 1.4178, + "grad_norm": 0.2443518191576004, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.916452864, + "gpu_mem": 4.455622144, + "loss": 1.3935, + "grad_norm": 0.2074698507785797, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.91684608, + "gpu_mem": 4.455637504, + "loss": 1.4111, + "grad_norm": 0.19738945364952087, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.917042688, + "gpu_mem": 4.45558528, + "loss": 1.3765, + "grad_norm": 0.10189327597618103, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.917435904, + "gpu_mem": 4.455605248, + "loss": 1.4068, + "grad_norm": 0.188468798995018, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.917632512, + "gpu_mem": 4.455586816, + "loss": 1.4007, + "grad_norm": 0.24485869705677032, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.918025728, + "gpu_mem": 4.455603712, + "loss": 1.3807, + "grad_norm": 0.16903604567050934, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.918222336, + "gpu_mem": 4.455609856, + "loss": 1.3861, + "grad_norm": 0.17098411917686462, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.918418944, + "gpu_mem": 4.455606784, + "loss": 1.377, + "grad_norm": 0.11444650590419769, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.91881216, + "gpu_mem": 4.455632896, + "loss": 1.3874, + "grad_norm": 0.11977389454841614, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.919008768, + "gpu_mem": 4.455606784, + "loss": 1.3912, + "grad_norm": 0.08771314471960068, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.919401984, + "gpu_mem": 4.45564672, + "loss": 1.3941, + "grad_norm": 0.21630851924419403, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.9197952, + "gpu_mem": 4.455596032, + "loss": 1.3868, + "grad_norm": 0.11535011976957321, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.919991808, + "gpu_mem": 4.455605248, + "loss": 1.4183, + "grad_norm": 0.21235281229019165, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.920188416, + "gpu_mem": 4.455625216, + "loss": 1.3831, + "grad_norm": 0.16136379539966583, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.920581632, + "gpu_mem": 4.455616, + "loss": 1.3859, + "grad_norm": 0.13336604833602905, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.92077824, + "gpu_mem": 4.455628288, + "loss": 1.3718, + "grad_norm": 0.16929689049720764, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.920974848, + "gpu_mem": 4.45559296, + "loss": 1.3829, + "grad_norm": 0.19579057395458221, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.921171456, + "gpu_mem": 4.45562368, + "loss": 1.3859, + "grad_norm": 0.2050841897726059, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.921564672, + "gpu_mem": 4.455619072, + "loss": 1.3757, + "grad_norm": 0.16676609218120575, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.92176128, + "gpu_mem": 4.455643648, + "loss": 1.3884, + "grad_norm": 0.12383885681629181, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.922154496, + "gpu_mem": 4.455580672, + "loss": 1.3923, + "grad_norm": 0.14915084838867188, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.922351104, + "gpu_mem": 4.455634432, + "loss": 1.3693, + "grad_norm": 0.1435364931821823, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.922547712, + "gpu_mem": 4.455626752, + "loss": 1.3904, + "grad_norm": 0.14253225922584534, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.922940928, + "gpu_mem": 4.455642112, + "loss": 1.4014, + "grad_norm": 0.14435213804244995, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.923137536, + "gpu_mem": 4.455643648, + "loss": 1.3845, + "grad_norm": 0.12501677870750427, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.923530752, + "gpu_mem": 4.455611392, + "loss": 1.4285, + "grad_norm": 0.2647113800048828, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.92372736, + "gpu_mem": 4.455603712, + "loss": 1.3998, + "grad_norm": 0.23141193389892578, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.923923968, + "gpu_mem": 4.45563904, + "loss": 1.3934, + "grad_norm": 0.14193156361579895, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.924120576, + "gpu_mem": 4.455586816, + "loss": 1.4039, + "grad_norm": 0.25639158487319946, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.924513792, + "gpu_mem": 4.455622144, + "loss": 1.3718, + "grad_norm": 0.17062553763389587, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.9247104, + "gpu_mem": 4.455611392, + "loss": 1.373, + "grad_norm": 0.12277490645647049, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.924907008, + "gpu_mem": 4.455643648, + "loss": 1.3875, + "grad_norm": 0.1929093897342682, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.925103616, + "gpu_mem": 4.455637504, + "loss": 1.3667, + "grad_norm": 0.19885137677192688, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.925300224, + "gpu_mem": 4.455616, + "loss": 1.3784, + "grad_norm": 0.12266800552606583, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.92569344, + "gpu_mem": 4.455612928, + "loss": 1.3821, + "grad_norm": 0.16938838362693787, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.925890048, + "gpu_mem": 4.455649792, + "loss": 1.4099, + "grad_norm": 0.11750156432390213, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.926086656, + "gpu_mem": 4.45562368, + "loss": 1.3728, + "grad_norm": 0.17166221141815186, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.926283264, + "gpu_mem": 4.45560064, + "loss": 1.3893, + "grad_norm": 0.19107574224472046, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.92667648, + "gpu_mem": 4.455626752, + "loss": 1.4149, + "grad_norm": 0.18033741414546967, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.926873088, + "gpu_mem": 4.45563904, + "loss": 1.4115, + "grad_norm": 0.16714097559452057, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.927266304, + "gpu_mem": 4.45560064, + "loss": 1.399, + "grad_norm": 0.18084725737571716, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.927462912, + "gpu_mem": 4.455609856, + "loss": 1.4003, + "grad_norm": 0.13099360466003418, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.927462912, + "gpu_mem": 4.455591424, + "loss": 1.3888, + "grad_norm": 0.1725221425294876, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.92765952, + "gpu_mem": 4.455605248, + "loss": 1.391, + "grad_norm": 0.275603711605072, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.928052736, + "gpu_mem": 4.455612928, + "loss": 1.4011, + "grad_norm": 0.161726713180542, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.928249344, + "gpu_mem": 4.455594496, + "loss": 1.3997, + "grad_norm": 0.20925098657608032, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.928445952, + "gpu_mem": 4.455625216, + "loss": 1.3884, + "grad_norm": 0.1754855364561081, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.92864256, + "gpu_mem": 4.455596032, + "loss": 1.3938, + "grad_norm": 0.22553697228431702, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.929035776, + "gpu_mem": 4.455620608, + "loss": 1.372, + "grad_norm": 0.11611100286245346, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.929232384, + "gpu_mem": 4.45560064, + "loss": 1.3915, + "grad_norm": 0.15747860074043274, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.929428992, + "gpu_mem": 4.455632896, + "loss": 1.4125, + "grad_norm": 0.280722975730896, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.9296256, + "gpu_mem": 4.45562368, + "loss": 1.3862, + "grad_norm": 0.14614327251911163, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.929822208, + "gpu_mem": 4.455619072, + "loss": 1.4165, + "grad_norm": 0.33601444959640503, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.930018816, + "gpu_mem": 4.455576064, + "loss": 1.3652, + "grad_norm": 0.13644161820411682, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.930412032, + "gpu_mem": 4.455655936, + "loss": 1.3721, + "grad_norm": 0.0781923159956932, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.93060864, + "gpu_mem": 4.455606784, + "loss": 1.3805, + "grad_norm": 0.19900958240032196, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.930805248, + "gpu_mem": 4.455606784, + "loss": 1.3704, + "grad_norm": 0.16141168773174286, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.931001856, + "gpu_mem": 4.455572992, + "loss": 1.4301, + "grad_norm": 0.27716320753097534, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.931198464, + "gpu_mem": 4.455612928, + "loss": 1.3838, + "grad_norm": 0.2620179057121277, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.931395072, + "gpu_mem": 4.45560832, + "loss": 1.3724, + "grad_norm": 0.18589220941066742, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.93159168, + "gpu_mem": 4.455596032, + "loss": 1.4077, + "grad_norm": 0.16074693202972412, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.931984896, + "gpu_mem": 4.455620608, + "loss": 1.425, + "grad_norm": 0.3229081332683563, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.932181504, + "gpu_mem": 4.455635968, + "loss": 1.401, + "grad_norm": 0.28662097454071045, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.932378112, + "gpu_mem": 4.45560832, + "loss": 1.3932, + "grad_norm": 0.1694614142179489, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.932378112, + "gpu_mem": 4.455605248, + "loss": 1.3922, + "grad_norm": 0.15834537148475647, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.93257472, + "gpu_mem": 4.455620608, + "loss": 1.4258, + "grad_norm": 0.20158275961875916, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.93257472, + "gpu_mem": 4.455597568, + "loss": 1.3946, + "grad_norm": 0.11845407634973526, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.932771328, + "gpu_mem": 4.455599104, + "loss": 1.3902, + "grad_norm": 0.12703348696231842, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.932967936, + "gpu_mem": 4.455640576, + "loss": 1.3896, + "grad_norm": 0.20284932851791382, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.933361152, + "gpu_mem": 4.455611392, + "loss": 1.3874, + "grad_norm": 0.13848000764846802, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.93355776, + "gpu_mem": 4.455611392, + "loss": 1.3833, + "grad_norm": 0.15022751688957214, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.933754368, + "gpu_mem": 4.45560832, + "loss": 1.3989, + "grad_norm": 0.2506808042526245, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.933950976, + "gpu_mem": 4.45560832, + "loss": 1.3945, + "grad_norm": 0.14420081675052643, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.934344192, + "gpu_mem": 4.455599104, + "loss": 1.3958, + "grad_norm": 0.14760451018810272, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.934344192, + "gpu_mem": 4.455634432, + "loss": 1.3737, + "grad_norm": 0.16956306993961334, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.9345408, + "gpu_mem": 4.455591424, + "loss": 1.3744, + "grad_norm": 0.1379985809326172, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.934737408, + "gpu_mem": 4.455619072, + "loss": 1.3859, + "grad_norm": 0.13299067318439484, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.934934016, + "gpu_mem": 4.455628288, + "loss": 1.3816, + "grad_norm": 0.28757578134536743, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.935130624, + "gpu_mem": 4.45560064, + "loss": 1.3701, + "grad_norm": 0.24873995780944824, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.935327232, + "gpu_mem": 4.455609856, + "loss": 1.3966, + "grad_norm": 0.17373625934123993, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.93552384, + "gpu_mem": 4.455611392, + "loss": 1.3906, + "grad_norm": 0.23343732953071594, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.935720448, + "gpu_mem": 4.455611392, + "loss": 1.3943, + "grad_norm": 0.18887114524841309, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.935917056, + "gpu_mem": 4.455596032, + "loss": 1.3967, + "grad_norm": 0.13336127996444702, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.936310272, + "gpu_mem": 4.455617536, + "loss": 1.3922, + "grad_norm": 0.1603967398405075, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.936310272, + "gpu_mem": 4.455651328, + "loss": 1.3834, + "grad_norm": 0.19836394488811493, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.93650688, + "gpu_mem": 4.455605248, + "loss": 1.4041, + "grad_norm": 0.17208240926265717, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.936703488, + "gpu_mem": 4.455611392, + "loss": 1.3907, + "grad_norm": 0.14950527250766754, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.936900096, + "gpu_mem": 4.455626752, + "loss": 1.3905, + "grad_norm": 0.14720837771892548, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.937096704, + "gpu_mem": 4.455645184, + "loss": 1.3787, + "grad_norm": 0.13805663585662842, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.937293312, + "gpu_mem": 4.455614464, + "loss": 1.3855, + "grad_norm": 0.08652158826589584, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.93748992, + "gpu_mem": 4.45560064, + "loss": 1.3753, + "grad_norm": 0.13674074411392212, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.937686528, + "gpu_mem": 4.45559296, + "loss": 1.3883, + "grad_norm": 0.07804403454065323, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.937883136, + "gpu_mem": 4.455657472, + "loss": 1.3761, + "grad_norm": 0.10688818246126175, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.938079744, + "gpu_mem": 4.455596032, + "loss": 1.3964, + "grad_norm": 0.24375967681407928, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.938276352, + "gpu_mem": 4.455648256, + "loss": 1.381, + "grad_norm": 0.0881073847413063, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.93847296, + "gpu_mem": 4.455629824, + "loss": 1.3833, + "grad_norm": 0.10650983452796936, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.938669568, + "gpu_mem": 4.455628288, + "loss": 1.3778, + "grad_norm": 0.19404929876327515, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.938866176, + "gpu_mem": 4.455632896, + "loss": 1.3805, + "grad_norm": 0.11814621835947037, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.939062784, + "gpu_mem": 4.45560832, + "loss": 1.3832, + "grad_norm": 0.31416213512420654, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.939259392, + "gpu_mem": 4.455637504, + "loss": 1.4034, + "grad_norm": 0.187570258975029, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.939456, + "gpu_mem": 4.455614464, + "loss": 1.3878, + "grad_norm": 0.23704944550991058, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.939456, + "gpu_mem": 4.455675904, + "loss": 1.3842, + "grad_norm": 0.19985656440258026, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.939652608, + "gpu_mem": 4.45560064, + "loss": 1.405, + "grad_norm": 0.2998318374156952, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.939849216, + "gpu_mem": 4.455611392, + "loss": 1.4062, + "grad_norm": 0.22002387046813965, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.940045824, + "gpu_mem": 4.455609856, + "loss": 1.4057, + "grad_norm": 0.10987144708633423, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.940242432, + "gpu_mem": 4.455606784, + "loss": 1.3684, + "grad_norm": 0.13767299056053162, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.94043904, + "gpu_mem": 4.455637504, + "loss": 1.3738, + "grad_norm": 0.1488540768623352, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.94043904, + "gpu_mem": 4.455616, + "loss": 1.3785, + "grad_norm": 0.16282391548156738, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.940635648, + "gpu_mem": 4.455611392, + "loss": 1.3884, + "grad_norm": 0.1714826375246048, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.940832256, + "gpu_mem": 4.455622144, + "loss": 1.3843, + "grad_norm": 0.2472050040960312, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.940832256, + "gpu_mem": 4.455626752, + "loss": 1.3968, + "grad_norm": 0.137643501162529, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.941028864, + "gpu_mem": 4.455588352, + "loss": 1.3896, + "grad_norm": 0.10332507640123367, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.941225472, + "gpu_mem": 4.455655936, + "loss": 1.3695, + "grad_norm": 0.12709857523441315, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.94142208, + "gpu_mem": 4.455619072, + "loss": 1.3733, + "grad_norm": 0.17139531672000885, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.941618688, + "gpu_mem": 4.45560832, + "loss": 1.4008, + "grad_norm": 0.16437961161136627, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.941815296, + "gpu_mem": 4.455625216, + "loss": 1.3811, + "grad_norm": 0.1774735003709793, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.941815296, + "gpu_mem": 4.455599104, + "loss": 1.4055, + "grad_norm": 0.28086498379707336, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.942011904, + "gpu_mem": 4.45564672, + "loss": 1.4051, + "grad_norm": 0.15969908237457275, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.942208512, + "gpu_mem": 4.455614464, + "loss": 1.3744, + "grad_norm": 0.16651958227157593, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.942208512, + "gpu_mem": 4.455603712, + "loss": 1.3713, + "grad_norm": 0.12160292267799377, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.94240512, + "gpu_mem": 4.455619072, + "loss": 1.3721, + "grad_norm": 0.13108134269714355, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.942601728, + "gpu_mem": 4.455616, + "loss": 1.3735, + "grad_norm": 0.11571836471557617, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.942798336, + "gpu_mem": 4.455616, + "loss": 1.3959, + "grad_norm": 0.22075843811035156, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.942994944, + "gpu_mem": 4.455603712, + "loss": 1.3779, + "grad_norm": 0.14646145701408386, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.943191552, + "gpu_mem": 4.455586816, + "loss": 1.3918, + "grad_norm": 0.17340078949928284, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.943191552, + "gpu_mem": 4.455649792, + "loss": 1.398, + "grad_norm": 0.15223081409931183, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.94338816, + "gpu_mem": 4.455603712, + "loss": 1.4, + "grad_norm": 0.1315041035413742, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.943584768, + "gpu_mem": 4.455612928, + "loss": 1.3919, + "grad_norm": 0.2404022514820099, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.943781376, + "gpu_mem": 4.455648256, + "loss": 1.3861, + "grad_norm": 0.1070752739906311, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.943977984, + "gpu_mem": 4.455612928, + "loss": 1.3865, + "grad_norm": 0.1120879203081131, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.944174592, + "gpu_mem": 4.455617536, + "loss": 1.3854, + "grad_norm": 0.12548568844795227, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.9443712, + "gpu_mem": 4.455665152, + "loss": 1.3864, + "grad_norm": 0.10937052220106125, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.9443712, + "gpu_mem": 4.455674368, + "loss": 1.3885, + "grad_norm": 0.1373073160648346, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.9443712, + "gpu_mem": 4.455628288, + "loss": 1.3969, + "grad_norm": 0.18791326880455017, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.944567808, + "gpu_mem": 4.455622144, + "loss": 1.4277, + "grad_norm": 0.3314601182937622, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.944764416, + "gpu_mem": 4.455683584, + "loss": 1.3936, + "grad_norm": 0.1891196221113205, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.944764416, + "gpu_mem": 4.455609856, + "loss": 1.3868, + "grad_norm": 0.1453242003917694, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.944961024, + "gpu_mem": 4.45560832, + "loss": 1.3834, + "grad_norm": 0.09256257861852646, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.945157632, + "gpu_mem": 4.455611392, + "loss": 1.3841, + "grad_norm": 0.12205930799245834, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.94535424, + "gpu_mem": 4.455597568, + "loss": 1.4067, + "grad_norm": 0.20825810730457306, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.94535424, + "gpu_mem": 4.455612928, + "loss": 1.3984, + "grad_norm": 0.16567936539649963, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.945550848, + "gpu_mem": 4.455651328, + "loss": 1.3825, + "grad_norm": 0.10865393280982971, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.945747456, + "gpu_mem": 4.45563136, + "loss": 1.3755, + "grad_norm": 0.1125832125544548, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.945944064, + "gpu_mem": 4.455657472, + "loss": 1.3949, + "grad_norm": 0.1998971700668335, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.945944064, + "gpu_mem": 4.45560832, + "loss": 1.3752, + "grad_norm": 0.1377192884683609, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.946140672, + "gpu_mem": 4.455602176, + "loss": 1.3963, + "grad_norm": 0.20409147441387177, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.946140672, + "gpu_mem": 4.455625216, + "loss": 1.3755, + "grad_norm": 0.1205507293343544, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.94633728, + "gpu_mem": 4.455603712, + "loss": 1.3632, + "grad_norm": 0.13806171715259552, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.946533888, + "gpu_mem": 4.455617536, + "loss": 1.3805, + "grad_norm": 0.14786016941070557, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.946533888, + "gpu_mem": 4.455622144, + "loss": 1.3679, + "grad_norm": 0.2236131727695465, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.946730496, + "gpu_mem": 4.455640576, + "loss": 1.4028, + "grad_norm": 0.26791509985923767, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.946927104, + "gpu_mem": 4.455611392, + "loss": 1.3595, + "grad_norm": 0.151590958237648, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.946927104, + "gpu_mem": 4.45563904, + "loss": 1.404, + "grad_norm": 0.10857828706502914, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.946927104, + "gpu_mem": 4.455620608, + "loss": 1.3874, + "grad_norm": 0.23128440976142883, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.947123712, + "gpu_mem": 4.45560832, + "loss": 1.4005, + "grad_norm": 0.19321958720684052, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.94732032, + "gpu_mem": 4.455617536, + "loss": 1.3734, + "grad_norm": 0.1322495937347412, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.947516928, + "gpu_mem": 4.455614464, + "loss": 1.388, + "grad_norm": 0.13946077227592468, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.947516928, + "gpu_mem": 4.455629824, + "loss": 1.4117, + "grad_norm": 0.19886770844459534, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.947713536, + "gpu_mem": 4.455637504, + "loss": 1.3785, + "grad_norm": 0.127224400639534, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.947910144, + "gpu_mem": 4.455626752, + "loss": 1.4004, + "grad_norm": 0.15896514058113098, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.947910144, + "gpu_mem": 4.455611392, + "loss": 1.389, + "grad_norm": 0.19347207248210907, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.948106752, + "gpu_mem": 4.455614464, + "loss": 1.3786, + "grad_norm": 0.10325469076633453, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.948106752, + "gpu_mem": 4.45560832, + "loss": 1.3767, + "grad_norm": 0.1444447785615921, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.94830336, + "gpu_mem": 4.455603712, + "loss": 1.393, + "grad_norm": 0.25527164340019226, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.948499968, + "gpu_mem": 4.455625216, + "loss": 1.389, + "grad_norm": 0.1414865255355835, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.948499968, + "gpu_mem": 4.455617536, + "loss": 1.3606, + "grad_norm": 0.1910536140203476, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.948696576, + "gpu_mem": 4.455589888, + "loss": 1.3753, + "grad_norm": 0.09304851293563843, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.948696576, + "gpu_mem": 4.455588352, + "loss": 1.3812, + "grad_norm": 0.10530442744493484, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.948696576, + "gpu_mem": 4.455614464, + "loss": 1.409, + "grad_norm": 0.2089644819498062, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.948893184, + "gpu_mem": 4.455597568, + "loss": 1.377, + "grad_norm": 0.1182415708899498, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.948893184, + "gpu_mem": 4.455628288, + "loss": 1.352, + "grad_norm": 0.3715735971927643, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.949089792, + "gpu_mem": 4.455611392, + "loss": 1.4015, + "grad_norm": 0.1776302307844162, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.9492864, + "gpu_mem": 4.455642112, + "loss": 1.3929, + "grad_norm": 0.18861955404281616, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.9492864, + "gpu_mem": 4.455609856, + "loss": 1.3953, + "grad_norm": 0.13656139373779297, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.9492864, + "gpu_mem": 4.455635968, + "loss": 1.3681, + "grad_norm": 0.1282545030117035, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.949483008, + "gpu_mem": 4.455611392, + "loss": 1.3843, + "grad_norm": 0.07744809240102768, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.949483008, + "gpu_mem": 4.455606784, + "loss": 1.3882, + "grad_norm": 0.10125494003295898, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.949679616, + "gpu_mem": 4.455609856, + "loss": 1.3909, + "grad_norm": 0.1669577956199646, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.949679616, + "gpu_mem": 4.455628288, + "loss": 1.3957, + "grad_norm": 0.1830621212720871, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.949679616, + "gpu_mem": 4.45560832, + "loss": 1.3721, + "grad_norm": 0.14906547963619232, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.949876224, + "gpu_mem": 4.455612928, + "loss": 1.3867, + "grad_norm": 0.0937037318944931, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.950072832, + "gpu_mem": 4.45560832, + "loss": 1.4103, + "grad_norm": 0.19134382903575897, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.950072832, + "gpu_mem": 4.455616, + "loss": 1.3926, + "grad_norm": 0.10115544497966766, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.95026944, + "gpu_mem": 4.455640576, + "loss": 1.3743, + "grad_norm": 0.12415555119514465, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.95026944, + "gpu_mem": 4.455632896, + "loss": 1.3894, + "grad_norm": 0.16117581725120544, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.950466048, + "gpu_mem": 4.455634432, + "loss": 1.3902, + "grad_norm": 0.12393181771039963, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.950466048, + "gpu_mem": 4.455609856, + "loss": 1.393, + "grad_norm": 0.16187816858291626, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.950662656, + "gpu_mem": 4.455611392, + "loss": 1.3852, + "grad_norm": 0.13279278576374054, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.950859264, + "gpu_mem": 4.45563136, + "loss": 1.389, + "grad_norm": 0.10837306827306747, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.950859264, + "gpu_mem": 4.455603712, + "loss": 1.3922, + "grad_norm": 0.21560761332511902, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.950859264, + "gpu_mem": 4.455616, + "loss": 1.3647, + "grad_norm": 0.1615973263978958, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.951055872, + "gpu_mem": 4.455625216, + "loss": 1.3991, + "grad_norm": 0.19027961790561676, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.95125248, + "gpu_mem": 4.455602176, + "loss": 1.3617, + "grad_norm": 0.20301522314548492, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.95125248, + "gpu_mem": 4.455626752, + "loss": 1.3982, + "grad_norm": 0.22716152667999268, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.95125248, + "gpu_mem": 4.455626752, + "loss": 1.3874, + "grad_norm": 0.1306220144033432, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.951449088, + "gpu_mem": 4.455609856, + "loss": 1.3815, + "grad_norm": 0.12610889971256256, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.951449088, + "gpu_mem": 4.455606784, + "loss": 1.3854, + "grad_norm": 0.3287367820739746, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.951449088, + "gpu_mem": 4.455599104, + "loss": 1.3864, + "grad_norm": 0.12929855287075043, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.951645696, + "gpu_mem": 4.455637504, + "loss": 1.4076, + "grad_norm": 0.2543030083179474, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.951645696, + "gpu_mem": 4.455614464, + "loss": 1.3833, + "grad_norm": 0.1170952245593071, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.951645696, + "gpu_mem": 4.455612928, + "loss": 1.3926, + "grad_norm": 0.11154106259346008, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.951645696, + "gpu_mem": 4.455629824, + "loss": 1.3912, + "grad_norm": 0.1513826847076416, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.951842304, + "gpu_mem": 4.455614464, + "loss": 1.3616, + "grad_norm": 0.10020670294761658, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.951842304, + "gpu_mem": 4.455626752, + "loss": 1.405, + "grad_norm": 0.22995728254318237, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.952038912, + "gpu_mem": 4.45563904, + "loss": 1.3957, + "grad_norm": 0.19204968214035034, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.95223552, + "gpu_mem": 4.455614464, + "loss": 1.4001, + "grad_norm": 0.1724342703819275, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.95223552, + "gpu_mem": 4.455659008, + "loss": 1.41, + "grad_norm": 0.25114649534225464, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.95223552, + "gpu_mem": 4.455632896, + "loss": 1.3651, + "grad_norm": 0.1062668040394783, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.952432128, + "gpu_mem": 4.455629824, + "loss": 1.3986, + "grad_norm": 0.13163644075393677, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.952432128, + "gpu_mem": 4.455611392, + "loss": 1.3718, + "grad_norm": 0.08316055685281754, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.952432128, + "gpu_mem": 4.455617536, + "loss": 1.403, + "grad_norm": 0.17354628443717957, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.952628736, + "gpu_mem": 4.455586816, + "loss": 1.3898, + "grad_norm": 0.148939847946167, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.952628736, + "gpu_mem": 4.455651328, + "loss": 1.3861, + "grad_norm": 0.11925389617681503, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.952825344, + "gpu_mem": 4.455605248, + "loss": 1.4147, + "grad_norm": 0.17690271139144897, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.952825344, + "gpu_mem": 4.455599104, + "loss": 1.3848, + "grad_norm": 0.10510093718767166, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.952825344, + "gpu_mem": 4.4556544, + "loss": 1.3839, + "grad_norm": 0.07285576313734055, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.953021952, + "gpu_mem": 4.455620608, + "loss": 1.4056, + "grad_norm": 0.17273887991905212, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.953021952, + "gpu_mem": 4.45560832, + "loss": 1.379, + "grad_norm": 0.20356370508670807, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.953021952, + "gpu_mem": 4.455612928, + "loss": 1.3795, + "grad_norm": 0.12373435497283936, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.95321856, + "gpu_mem": 4.45559296, + "loss": 1.3838, + "grad_norm": 0.11669295281171799, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.95321856, + "gpu_mem": 4.455617536, + "loss": 1.3708, + "grad_norm": 0.08797701448202133, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.953415168, + "gpu_mem": 4.455596032, + "loss": 1.3798, + "grad_norm": 0.09130707383155823, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.953415168, + "gpu_mem": 4.455612928, + "loss": 1.38, + "grad_norm": 0.11233506351709366, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.953415168, + "gpu_mem": 4.4555776, + "loss": 1.391, + "grad_norm": 0.07960695028305054, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.953611776, + "gpu_mem": 4.455609856, + "loss": 1.3771, + "grad_norm": 0.11814766377210617, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.953808384, + "gpu_mem": 4.455599104, + "loss": 1.3814, + "grad_norm": 0.08579172194004059, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.953808384, + "gpu_mem": 4.455635968, + "loss": 1.3935, + "grad_norm": 0.17007967829704285, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.953808384, + "gpu_mem": 4.455602176, + "loss": 1.3739, + "grad_norm": 0.1313059777021408, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.954004992, + "gpu_mem": 4.455625216, + "loss": 1.379, + "grad_norm": 0.1895531862974167, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.954004992, + "gpu_mem": 4.455614464, + "loss": 1.3797, + "grad_norm": 0.12682649493217468, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.954004992, + "gpu_mem": 4.455620608, + "loss": 1.3821, + "grad_norm": 0.10858836770057678, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.9542016, + "gpu_mem": 4.455614464, + "loss": 1.3989, + "grad_norm": 0.21132023632526398, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.9542016, + "gpu_mem": 4.455632896, + "loss": 1.3892, + "grad_norm": 0.16347794234752655, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.9542016, + "gpu_mem": 4.45559296, + "loss": 1.3949, + "grad_norm": 0.09750188142061234, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.9542016, + "gpu_mem": 4.455625216, + "loss": 1.3681, + "grad_norm": 0.07433948665857315, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.954398208, + "gpu_mem": 4.455645184, + "loss": 1.3927, + "grad_norm": 0.10452816635370255, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.954398208, + "gpu_mem": 4.45563904, + "loss": 1.3912, + "grad_norm": 0.13778789341449738, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.954398208, + "gpu_mem": 4.455602176, + "loss": 1.4025, + "grad_norm": 0.215082049369812, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.954594816, + "gpu_mem": 4.455619072, + "loss": 1.4003, + "grad_norm": 0.17578022181987762, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.954594816, + "gpu_mem": 4.455596032, + "loss": 1.3605, + "grad_norm": 0.17737841606140137, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.954594816, + "gpu_mem": 4.455628288, + "loss": 1.3666, + "grad_norm": 0.2664681375026703, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.954594816, + "gpu_mem": 4.45562368, + "loss": 1.3767, + "grad_norm": 0.11666446179151535, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.954791424, + "gpu_mem": 4.455632896, + "loss": 1.3843, + "grad_norm": 0.11560229957103729, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.954791424, + "gpu_mem": 4.455606784, + "loss": 1.3934, + "grad_norm": 0.11155625432729721, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.954791424, + "gpu_mem": 4.455626752, + "loss": 1.3735, + "grad_norm": 0.24961571395397186, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.954791424, + "gpu_mem": 4.45560064, + "loss": 1.3862, + "grad_norm": 0.12204889208078384, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.954988032, + "gpu_mem": 4.455625216, + "loss": 1.4028, + "grad_norm": 0.14311876893043518, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.954988032, + "gpu_mem": 4.455609856, + "loss": 1.3722, + "grad_norm": 0.1785416603088379, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.954988032, + "gpu_mem": 4.455643648, + "loss": 1.3819, + "grad_norm": 0.2443164587020874, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.95518464, + "gpu_mem": 4.45562368, + "loss": 1.3923, + "grad_norm": 0.15114539861679077, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.95518464, + "gpu_mem": 4.45560832, + "loss": 1.3805, + "grad_norm": 0.10945875942707062, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.95518464, + "gpu_mem": 4.455643648, + "loss": 1.3841, + "grad_norm": 0.08985135704278946, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.95518464, + "gpu_mem": 4.455649792, + "loss": 1.3933, + "grad_norm": 0.164816215634346, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.95518464, + "gpu_mem": 4.455612928, + "loss": 1.3915, + "grad_norm": 0.18174226582050323, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.955381248, + "gpu_mem": 4.455591424, + "loss": 1.3911, + "grad_norm": 0.21274980902671814, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.955381248, + "gpu_mem": 4.455643648, + "loss": 1.3979, + "grad_norm": 0.21175791323184967, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.955381248, + "gpu_mem": 4.455629824, + "loss": 1.3824, + "grad_norm": 0.15183362364768982, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.955381248, + "gpu_mem": 4.45562368, + "loss": 1.3842, + "grad_norm": 0.1986180692911148, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.955577856, + "gpu_mem": 4.455629824, + "loss": 1.4076, + "grad_norm": 0.25155699253082275, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.955577856, + "gpu_mem": 4.455606784, + "loss": 1.3854, + "grad_norm": 0.1798579841852188, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.955577856, + "gpu_mem": 4.455620608, + "loss": 1.3662, + "grad_norm": 0.18152295053005219, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.955577856, + "gpu_mem": 4.455620608, + "loss": 1.3789, + "grad_norm": 0.16208909451961517, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.955774464, + "gpu_mem": 4.455589888, + "loss": 1.372, + "grad_norm": 0.28387215733528137, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.955774464, + "gpu_mem": 4.45562368, + "loss": 1.3831, + "grad_norm": 0.19361627101898193, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.955774464, + "gpu_mem": 4.455602176, + "loss": 1.3815, + "grad_norm": 0.110410176217556, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.955774464, + "gpu_mem": 4.455609856, + "loss": 1.3756, + "grad_norm": 0.09755247086286545, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.955971072, + "gpu_mem": 4.455628288, + "loss": 1.377, + "grad_norm": 0.07346513122320175, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.955971072, + "gpu_mem": 4.455596032, + "loss": 1.3892, + "grad_norm": 0.10962972044944763, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.955971072, + "gpu_mem": 4.45560064, + "loss": 1.3957, + "grad_norm": 0.084483303129673, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.95616768, + "gpu_mem": 4.455596032, + "loss": 1.3804, + "grad_norm": 0.1282627284526825, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.95616768, + "gpu_mem": 4.455640576, + "loss": 1.3889, + "grad_norm": 0.1248074322938919, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.956364288, + "gpu_mem": 4.45562368, + "loss": 1.3902, + "grad_norm": 0.24359115958213806, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.956364288, + "gpu_mem": 4.455612928, + "loss": 1.3765, + "grad_norm": 0.10514674335718155, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.956364288, + "gpu_mem": 4.455634432, + "loss": 1.3924, + "grad_norm": 0.2762450575828552, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.956364288, + "gpu_mem": 4.45560064, + "loss": 1.3985, + "grad_norm": 0.11570607870817184, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.956364288, + "gpu_mem": 4.455616, + "loss": 1.35, + "grad_norm": 0.1867758333683014, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.956364288, + "gpu_mem": 4.455616, + "loss": 1.3897, + "grad_norm": 0.1614803820848465, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.956560896, + "gpu_mem": 4.455606784, + "loss": 1.3862, + "grad_norm": 0.147138774394989, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.956560896, + "gpu_mem": 4.455617536, + "loss": 1.4106, + "grad_norm": 0.33423754572868347, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.956560896, + "gpu_mem": 4.455642112, + "loss": 1.3751, + "grad_norm": 0.13124047219753265, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.956560896, + "gpu_mem": 4.455594496, + "loss": 1.398, + "grad_norm": 0.19443421065807343, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.956560896, + "gpu_mem": 4.455629824, + "loss": 1.3886, + "grad_norm": 0.17181289196014404, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.956560896, + "gpu_mem": 4.455591424, + "loss": 1.3595, + "grad_norm": 0.2101513147354126, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.956757504, + "gpu_mem": 4.455609856, + "loss": 1.3937, + "grad_norm": 0.21757926046848297, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.956757504, + "gpu_mem": 4.455602176, + "loss": 1.3857, + "grad_norm": 0.11997231096029282, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.956757504, + "gpu_mem": 4.45563904, + "loss": 1.3841, + "grad_norm": 0.11704442650079727, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.956757504, + "gpu_mem": 4.455599104, + "loss": 1.3788, + "grad_norm": 0.14476141333580017, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.956757504, + "gpu_mem": 4.455612928, + "loss": 1.3866, + "grad_norm": 0.21956875920295715, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.956954112, + "gpu_mem": 4.455617536, + "loss": 1.3925, + "grad_norm": 0.20143459737300873, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.95715072, + "gpu_mem": 4.455579136, + "loss": 1.3741, + "grad_norm": 0.2133656144142151, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.95715072, + "gpu_mem": 4.455602176, + "loss": 1.3975, + "grad_norm": 0.12042617797851562, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.95715072, + "gpu_mem": 4.45560064, + "loss": 1.3719, + "grad_norm": 0.13165828585624695, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.95715072, + "gpu_mem": 4.455619072, + "loss": 1.4071, + "grad_norm": 0.15639862418174744, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.95715072, + "gpu_mem": 4.455616, + "loss": 1.3946, + "grad_norm": 0.10084199160337448, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.957347328, + "gpu_mem": 4.455614464, + "loss": 1.3889, + "grad_norm": 0.07884629815816879, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.957347328, + "gpu_mem": 4.455632896, + "loss": 1.3747, + "grad_norm": 0.17722991108894348, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.957347328, + "gpu_mem": 4.455594496, + "loss": 1.374, + "grad_norm": 0.09528081864118576, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.957347328, + "gpu_mem": 4.45563904, + "loss": 1.3758, + "grad_norm": 0.12855112552642822, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455603712, + "loss": 1.3856, + "grad_norm": 0.1685332953929901, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.957543936, + "gpu_mem": 4.45563136, + "loss": 1.3793, + "grad_norm": 0.14854636788368225, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455611392, + "loss": 1.3888, + "grad_norm": 0.12788328528404236, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455657472, + "loss": 1.3895, + "grad_norm": 0.2529086768627167, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455622144, + "loss": 1.3896, + "grad_norm": 0.13850978016853333, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455612928, + "loss": 1.3889, + "grad_norm": 0.15749132633209229, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455606784, + "loss": 1.4053, + "grad_norm": 0.14779983460903168, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455591424, + "loss": 1.3812, + "grad_norm": 0.12526153028011322, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.957543936, + "gpu_mem": 4.455609856, + "loss": 1.3955, + "grad_norm": 0.22243613004684448, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.957740544, + "gpu_mem": 4.455611392, + "loss": 1.3939, + "grad_norm": 0.23048461973667145, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.957740544, + "gpu_mem": 4.455616, + "loss": 1.4025, + "grad_norm": 0.18228009343147278, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.957740544, + "gpu_mem": 4.455619072, + "loss": 1.3905, + "grad_norm": 0.10705084353685379, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.957740544, + "gpu_mem": 4.455612928, + "loss": 1.3875, + "grad_norm": 0.1794833242893219, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.957740544, + "gpu_mem": 4.45563904, + "loss": 1.3804, + "grad_norm": 0.12607093155384064, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.957937152, + "gpu_mem": 4.455606784, + "loss": 1.388, + "grad_norm": 0.20000159740447998, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.957937152, + "gpu_mem": 4.455634432, + "loss": 1.3848, + "grad_norm": 0.1962977945804596, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.957937152, + "gpu_mem": 4.455642112, + "loss": 1.3766, + "grad_norm": 0.23785002529621124, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.957937152, + "gpu_mem": 4.45562368, + "loss": 1.403, + "grad_norm": 0.19998309016227722, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.957937152, + "gpu_mem": 4.455609856, + "loss": 1.3926, + "grad_norm": 0.14376413822174072, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455620608, + "loss": 1.3904, + "grad_norm": 0.12867380678653717, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455612928, + "loss": 1.3706, + "grad_norm": 0.15609678626060486, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455629824, + "loss": 1.3749, + "grad_norm": 0.09961818158626556, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455602176, + "loss": 1.369, + "grad_norm": 0.16495688259601593, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455632896, + "loss": 1.4012, + "grad_norm": 0.14402242004871368, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455614464, + "loss": 1.3759, + "grad_norm": 0.175029456615448, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455602176, + "loss": 1.3954, + "grad_norm": 0.12859249114990234, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.95813376, + "gpu_mem": 4.455614464, + "loss": 1.376, + "grad_norm": 0.17375575006008148, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.958330368, + "gpu_mem": 4.455620608, + "loss": 1.3861, + "grad_norm": 0.11873313039541245, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.958330368, + "gpu_mem": 4.45560832, + "loss": 1.393, + "grad_norm": 0.15762026607990265, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.958330368, + "gpu_mem": 4.455597568, + "loss": 1.3799, + "grad_norm": 0.17614008486270905, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.958330368, + "gpu_mem": 4.455599104, + "loss": 1.3866, + "grad_norm": 0.16230840981006622, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.958330368, + "gpu_mem": 4.455612928, + "loss": 1.3792, + "grad_norm": 0.08348682522773743, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.958330368, + "gpu_mem": 4.455616, + "loss": 1.381, + "grad_norm": 0.0881672352552414, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.958330368, + "gpu_mem": 4.455626752, + "loss": 1.3847, + "grad_norm": 0.16743974387645721, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.958330368, + "gpu_mem": 4.45560064, + "loss": 1.3735, + "grad_norm": 0.13205131888389587, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.958330368, + "gpu_mem": 4.455616, + "loss": 1.3709, + "grad_norm": 0.10833486914634705, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455625216, + "loss": 1.3675, + "grad_norm": 0.13472846150398254, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455599104, + "loss": 1.3885, + "grad_norm": 0.1967596411705017, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455605248, + "loss": 1.3881, + "grad_norm": 0.16674017906188965, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455594496, + "loss": 1.3792, + "grad_norm": 0.2490089237689972, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.958526976, + "gpu_mem": 4.45560064, + "loss": 1.3808, + "grad_norm": 0.12127837538719177, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455637504, + "loss": 1.3985, + "grad_norm": 0.13927540183067322, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.958526976, + "gpu_mem": 4.45558528, + "loss": 1.3856, + "grad_norm": 0.12947559356689453, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455605248, + "loss": 1.3943, + "grad_norm": 0.2050401270389557, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455605248, + "loss": 1.4127, + "grad_norm": 0.20211903750896454, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.958526976, + "gpu_mem": 4.455603712, + "loss": 1.3979, + "grad_norm": 0.14255227148532867, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.958723584, + "gpu_mem": 4.455602176, + "loss": 1.3619, + "grad_norm": 0.15288756787776947, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.958723584, + "gpu_mem": 4.455594496, + "loss": 1.3943, + "grad_norm": 0.2182450145483017, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.958723584, + "gpu_mem": 4.4556544, + "loss": 1.3827, + "grad_norm": 0.12184063345193863, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455599104, + "loss": 1.3698, + "grad_norm": 0.12414515763521194, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455582208, + "loss": 1.3798, + "grad_norm": 0.14243240654468536, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455612928, + "loss": 1.3879, + "grad_norm": 0.1982772797346115, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455657472, + "loss": 1.3756, + "grad_norm": 0.1531526893377304, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455637504, + "loss": 1.4026, + "grad_norm": 0.16336970031261444, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455637504, + "loss": 1.3983, + "grad_norm": 0.13184873759746552, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455603712, + "loss": 1.4107, + "grad_norm": 0.2059178203344345, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455628288, + "loss": 1.3707, + "grad_norm": 0.09727326780557632, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.958920192, + "gpu_mem": 4.45563136, + "loss": 1.3829, + "grad_norm": 0.12138403207063675, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455609856, + "loss": 1.3797, + "grad_norm": 0.18657460808753967, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455632896, + "loss": 1.3829, + "grad_norm": 0.25270581245422363, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455612928, + "loss": 1.3756, + "grad_norm": 0.14887063205242157, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455637504, + "loss": 1.3845, + "grad_norm": 0.132082998752594, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455620608, + "loss": 1.3787, + "grad_norm": 0.0789191797375679, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455616, + "loss": 1.3874, + "grad_norm": 0.1849944293498993, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.958920192, + "gpu_mem": 4.455628288, + "loss": 1.382, + "grad_norm": 0.1404387503862381, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.9591168, + "gpu_mem": 4.455599104, + "loss": 1.3783, + "grad_norm": 0.1402731090784073, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.9591168, + "gpu_mem": 4.455612928, + "loss": 1.3762, + "grad_norm": 0.15089792013168335, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.9591168, + "gpu_mem": 4.455599104, + "loss": 1.3696, + "grad_norm": 0.13106438517570496, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.9591168, + "gpu_mem": 4.45559296, + "loss": 1.3935, + "grad_norm": 0.09939797967672348, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.9591168, + "gpu_mem": 4.455599104, + "loss": 1.3873, + "grad_norm": 0.0780617892742157, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455612928, + "loss": 1.3882, + "grad_norm": 0.11309966444969177, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455596032, + "loss": 1.3809, + "grad_norm": 0.22138404846191406, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455649792, + "loss": 1.3785, + "grad_norm": 0.14897671341896057, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.959313408, + "gpu_mem": 4.45559296, + "loss": 1.3842, + "grad_norm": 0.10050924122333527, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455671296, + "loss": 1.3954, + "grad_norm": 0.1750260889530182, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455614464, + "loss": 1.3727, + "grad_norm": 0.2976323366165161, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455632896, + "loss": 1.3726, + "grad_norm": 0.16810554265975952, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.959313408, + "gpu_mem": 4.45560832, + "loss": 1.3984, + "grad_norm": 0.1683572679758072, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455640576, + "loss": 1.3713, + "grad_norm": 0.1282840520143509, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455660544, + "loss": 1.3881, + "grad_norm": 0.15000705420970917, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455589888, + "loss": 1.382, + "grad_norm": 0.1279844343662262, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455603712, + "loss": 1.3701, + "grad_norm": 0.1586838811635971, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455588352, + "loss": 1.3782, + "grad_norm": 0.14760109782218933, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.959313408, + "gpu_mem": 4.455626752, + "loss": 1.3764, + "grad_norm": 0.0972471833229065, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455626752, + "loss": 1.3967, + "grad_norm": 0.13810722529888153, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455612928, + "loss": 1.3884, + "grad_norm": 0.14698509871959686, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455603712, + "loss": 1.3798, + "grad_norm": 0.09506634622812271, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.959510016, + "gpu_mem": 4.45560832, + "loss": 1.3862, + "grad_norm": 0.09187142550945282, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455611392, + "loss": 1.3805, + "grad_norm": 0.12737655639648438, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455617536, + "loss": 1.3883, + "grad_norm": 0.10178422927856445, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455635968, + "loss": 1.3787, + "grad_norm": 0.09081809967756271, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455629824, + "loss": 1.3888, + "grad_norm": 0.15421898663043976, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455606784, + "loss": 1.3729, + "grad_norm": 0.11899249255657196, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455594496, + "loss": 1.3773, + "grad_norm": 0.18847927451133728, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455560704, + "loss": 1.3805, + "grad_norm": 0.2878577411174774, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.959510016, + "gpu_mem": 4.45560832, + "loss": 1.3835, + "grad_norm": 0.12328501790761948, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455574528, + "loss": 1.3788, + "grad_norm": 0.13373613357543945, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455622144, + "loss": 1.3769, + "grad_norm": 0.1797960251569748, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455620608, + "loss": 1.3866, + "grad_norm": 0.10254404693841934, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.959510016, + "gpu_mem": 4.455622144, + "loss": 1.398, + "grad_norm": 0.16689260303974152, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.959510016, + "gpu_mem": 4.45563136, + "loss": 1.3744, + "grad_norm": 0.1213759332895279, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.959706624, + "gpu_mem": 4.455606784, + "loss": 1.3851, + "grad_norm": 0.14462347328662872, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.959706624, + "gpu_mem": 4.455591424, + "loss": 1.3855, + "grad_norm": 0.11478224396705627, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.959706624, + "gpu_mem": 4.455620608, + "loss": 1.3885, + "grad_norm": 0.17257098853588104, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.959706624, + "gpu_mem": 4.455634432, + "loss": 1.3831, + "grad_norm": 0.14720571041107178, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455589888, + "loss": 1.3815, + "grad_norm": 0.19833175837993622, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455596032, + "loss": 1.3758, + "grad_norm": 0.18438014388084412, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455625216, + "loss": 1.3785, + "grad_norm": 0.08320009708404541, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455620608, + "loss": 1.3852, + "grad_norm": 0.14464764297008514, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455606784, + "loss": 1.3857, + "grad_norm": 0.13265293836593628, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455620608, + "loss": 1.3804, + "grad_norm": 0.1849140226840973, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455609856, + "loss": 1.3741, + "grad_norm": 0.14215770363807678, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455616, + "loss": 1.3699, + "grad_norm": 0.10068326443433762, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455620608, + "loss": 1.3882, + "grad_norm": 0.13041482865810394, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455616, + "loss": 1.3863, + "grad_norm": 0.10676822066307068, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455589888, + "loss": 1.396, + "grad_norm": 0.1325804591178894, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455599104, + "loss": 1.3832, + "grad_norm": 0.16988079249858856, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455617536, + "loss": 1.3777, + "grad_norm": 0.18439553678035736, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455588352, + "loss": 1.3935, + "grad_norm": 0.19071577489376068, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455619072, + "loss": 1.3745, + "grad_norm": 0.16462069749832153, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455628288, + "loss": 1.3886, + "grad_norm": 0.13687770068645477, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455589888, + "loss": 1.3812, + "grad_norm": 0.11779221147298813, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455594496, + "loss": 1.373, + "grad_norm": 0.13091064989566803, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455619072, + "loss": 1.3745, + "grad_norm": 0.09474749863147736, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.959903232, + "gpu_mem": 4.455637504, + "loss": 1.3776, + "grad_norm": 0.09142878651618958, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455619072, + "loss": 1.4031, + "grad_norm": 0.2543553113937378, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.96009984, + "gpu_mem": 4.45566976, + "loss": 1.3866, + "grad_norm": 0.15314586460590363, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455602176, + "loss": 1.3666, + "grad_norm": 0.14817368984222412, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455603712, + "loss": 1.3827, + "grad_norm": 0.12016596645116806, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455603712, + "loss": 1.4035, + "grad_norm": 0.19155895709991455, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455609856, + "loss": 1.3716, + "grad_norm": 0.13089391589164734, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.96009984, + "gpu_mem": 4.45562368, + "loss": 1.3843, + "grad_norm": 0.14605748653411865, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455628288, + "loss": 1.3946, + "grad_norm": 0.11438155919313431, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455622144, + "loss": 1.391, + "grad_norm": 0.15182887017726898, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455616, + "loss": 1.397, + "grad_norm": 0.19055500626564026, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455629824, + "loss": 1.3794, + "grad_norm": 0.2129579782485962, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455622144, + "loss": 1.3737, + "grad_norm": 0.10240568220615387, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455606784, + "loss": 1.4033, + "grad_norm": 0.23083072900772095, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455616, + "loss": 1.3778, + "grad_norm": 0.11061505973339081, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455625216, + "loss": 1.3784, + "grad_norm": 0.11961214989423752, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455628288, + "loss": 1.3918, + "grad_norm": 0.20316389203071594, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455582208, + "loss": 1.3755, + "grad_norm": 0.1126033291220665, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455635968, + "loss": 1.3893, + "grad_norm": 0.18289880454540253, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455637504, + "loss": 1.3852, + "grad_norm": 0.11101100593805313, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455582208, + "loss": 1.3898, + "grad_norm": 0.10690543055534363, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455616, + "loss": 1.3723, + "grad_norm": 0.1652965396642685, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455594496, + "loss": 1.3643, + "grad_norm": 0.14307641983032227, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455625216, + "loss": 1.376, + "grad_norm": 0.09126594662666321, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.96009984, + "gpu_mem": 4.45560064, + "loss": 1.3922, + "grad_norm": 0.20290158689022064, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455634432, + "loss": 1.3844, + "grad_norm": 0.1729162484407425, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.96009984, + "gpu_mem": 4.4556544, + "loss": 1.3908, + "grad_norm": 0.12463398277759552, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455619072, + "loss": 1.3875, + "grad_norm": 0.14790940284729004, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.96009984, + "gpu_mem": 4.45563904, + "loss": 1.3767, + "grad_norm": 0.12771911919116974, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455619072, + "loss": 1.3798, + "grad_norm": 0.16484121978282928, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455620608, + "loss": 1.3766, + "grad_norm": 0.18543551862239838, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455612928, + "loss": 1.3808, + "grad_norm": 0.14693662524223328, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455611392, + "loss": 1.373, + "grad_norm": 0.18759794533252716, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455625216, + "loss": 1.391, + "grad_norm": 0.12348757684230804, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455596032, + "loss": 1.3899, + "grad_norm": 0.11732631921768188, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.96009984, + "gpu_mem": 4.45564672, + "loss": 1.39, + "grad_norm": 0.26204946637153625, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455643648, + "loss": 1.3822, + "grad_norm": 0.11750993132591248, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.96009984, + "gpu_mem": 4.45562368, + "loss": 1.3871, + "grad_norm": 0.1407662034034729, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455605248, + "loss": 1.3964, + "grad_norm": 0.15434487164020538, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455614464, + "loss": 1.3951, + "grad_norm": 0.08506003767251968, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455582208, + "loss": 1.3781, + "grad_norm": 0.1437395215034485, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455643648, + "loss": 1.377, + "grad_norm": 0.13745751976966858, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455642112, + "loss": 1.3944, + "grad_norm": 0.09884796291589737, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455597568, + "loss": 1.3768, + "grad_norm": 0.1483048051595688, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455629824, + "loss": 1.3599, + "grad_norm": 0.11833728849887848, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.96009984, + "gpu_mem": 4.45562368, + "loss": 1.3734, + "grad_norm": 0.1995954066514969, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455609856, + "loss": 1.3941, + "grad_norm": 0.1858077347278595, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.96009984, + "gpu_mem": 4.455609856, + "loss": 1.3691, + "grad_norm": 0.1311863362789154, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.960296448, + "gpu_mem": 4.455635968, + "loss": 1.3805, + "grad_norm": 0.11760501563549042, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.960296448, + "gpu_mem": 4.45562368, + "loss": 1.3947, + "grad_norm": 0.1497272253036499, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455614464, + "loss": 1.3889, + "grad_norm": 0.13449910283088684, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455606784, + "loss": 1.376, + "grad_norm": 0.11478990316390991, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455629824, + "loss": 1.3727, + "grad_norm": 0.1740272045135498, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455619072, + "loss": 1.3648, + "grad_norm": 0.1411309689283371, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455603712, + "loss": 1.3915, + "grad_norm": 0.20162473618984222, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.960493056, + "gpu_mem": 4.45566976, + "loss": 1.3774, + "grad_norm": 0.18679168820381165, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455612928, + "loss": 1.3886, + "grad_norm": 0.11987175047397614, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455599104, + "loss": 1.3847, + "grad_norm": 0.13719777762889862, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455663616, + "loss": 1.3947, + "grad_norm": 0.1739129275083542, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.960493056, + "gpu_mem": 4.45559296, + "loss": 1.3806, + "grad_norm": 0.16488467156887054, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455620608, + "loss": 1.3688, + "grad_norm": 0.12963488698005676, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.960493056, + "gpu_mem": 4.45562368, + "loss": 1.3866, + "grad_norm": 0.11231622844934464, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455599104, + "loss": 1.384, + "grad_norm": 0.1978352963924408, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455626752, + "loss": 1.3827, + "grad_norm": 0.13128869235515594, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455635968, + "loss": 1.3911, + "grad_norm": 0.11878422647714615, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455632896, + "loss": 1.4017, + "grad_norm": 0.1326499581336975, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455605248, + "loss": 1.3852, + "grad_norm": 0.17195501923561096, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455351808, + "loss": 1.3856, + "grad_norm": 0.14645199477672577, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.960493056, + "gpu_mem": 4.455351808, + "train_runtime": 16066.5632, + "train_samples_per_second": 2.484, + "train_steps_per_second": 0.039, + "total_flos": 8.44927549775954e+16, + "train_loss": 1.4820800524873612 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef1d724eca7640a4f365c193cda2fc4efdb2073 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..793753678f63ad975424df0cab753e652b74909e --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.5558653654650468 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f868bfa4208c1a0254d5dda0aa5acc01010dfa62 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 50462720 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-hellaswag-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2", + "seed": 42, + "timestamp": "2025-09-14T06:02:25.573059" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..3a68bba435e3b8b917f7063fdd0ea49536655e7d --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r32-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.84870912, + "gpu_mem": 4.6196096, + "loss": 3.4877, + "grad_norm": 2.696638345718384, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.854803968, + "gpu_mem": 5.023304704, + "loss": 3.6203, + "grad_norm": 2.5981273651123047, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.855983616, + "gpu_mem": 5.023312384, + "loss": 3.4237, + "grad_norm": 2.648874521255493, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.856966656, + "gpu_mem": 5.023346176, + "loss": 3.6011, + "grad_norm": 2.627807855606079, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.857949696, + "gpu_mem": 5.023309312, + "loss": 3.4911, + "grad_norm": 2.611544609069824, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.858932736, + "gpu_mem": 5.023355392, + "loss": 3.5742, + "grad_norm": 2.757070302963257, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.859719168, + "gpu_mem": 5.023315456, + "loss": 3.52, + "grad_norm": 2.505768060684204, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.860702208, + "gpu_mem": 5.023346176, + "loss": 3.2369, + "grad_norm": 2.6375603675842285, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.86148864, + "gpu_mem": 5.023346176, + "loss": 3.1479, + "grad_norm": 2.4941816329956055, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.862275072, + "gpu_mem": 5.023289344, + "loss": 2.9718, + "grad_norm": 2.4022130966186523, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.863061504, + "gpu_mem": 5.023309312, + "loss": 2.9136, + "grad_norm": 2.579561233520508, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.863847936, + "gpu_mem": 5.02330624, + "loss": 3.1279, + "grad_norm": 2.5186097621917725, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.86443776, + "gpu_mem": 5.02329856, + "loss": 2.8593, + "grad_norm": 2.222869873046875, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.8654208, + "gpu_mem": 5.023324672, + "loss": 2.7062, + "grad_norm": 2.150005340576172, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.866010624, + "gpu_mem": 5.023323136, + "loss": 2.4799, + "grad_norm": 1.9038403034210205, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.866797056, + "gpu_mem": 5.023315456, + "loss": 2.5545, + "grad_norm": 1.7763649225234985, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.86738688, + "gpu_mem": 5.023315456, + "loss": 2.2961, + "grad_norm": 1.4517396688461304, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.868173312, + "gpu_mem": 5.023315456, + "loss": 2.2845, + "grad_norm": 1.4001959562301636, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.868959744, + "gpu_mem": 5.023315456, + "loss": 2.2261, + "grad_norm": 1.253830075263977, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.869549568, + "gpu_mem": 5.023289344, + "loss": 2.0651, + "grad_norm": 1.2541093826293945, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.870139392, + "gpu_mem": 5.02330624, + "loss": 2.0582, + "grad_norm": 1.1301401853561401, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.870729216, + "gpu_mem": 5.02331392, + "loss": 1.8067, + "grad_norm": 0.7521812319755554, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.87131904, + "gpu_mem": 5.023327744, + "loss": 1.7535, + "grad_norm": 0.8587539792060852, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.872105472, + "gpu_mem": 5.023312384, + "loss": 1.7686, + "grad_norm": 0.9783183336257935, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.872695296, + "gpu_mem": 5.023300096, + "loss": 1.5821, + "grad_norm": 0.4712306261062622, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.87328512, + "gpu_mem": 5.02330624, + "loss": 1.495, + "grad_norm": 0.36066922545433044, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.873874944, + "gpu_mem": 5.02331392, + "loss": 1.4999, + "grad_norm": 0.32544535398483276, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.87426816, + "gpu_mem": 5.023309312, + "loss": 1.5213, + "grad_norm": 0.3280967175960541, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.874857984, + "gpu_mem": 5.023318528, + "loss": 1.4032, + "grad_norm": 0.2005397081375122, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.875447808, + "gpu_mem": 5.02329088, + "loss": 1.4878, + "grad_norm": 0.25324374437332153, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.876037632, + "gpu_mem": 5.023346176, + "loss": 1.4151, + "grad_norm": 0.2827390730381012, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.876627456, + "gpu_mem": 5.023338496, + "loss": 1.3992, + "grad_norm": 0.19125023484230042, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.87721728, + "gpu_mem": 5.023292416, + "loss": 1.4065, + "grad_norm": 0.1686975657939911, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.877807104, + "gpu_mem": 5.023310848, + "loss": 1.4047, + "grad_norm": 0.16880421340465546, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.878396928, + "gpu_mem": 5.023332352, + "loss": 1.4998, + "grad_norm": 0.5426452159881592, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.878986752, + "gpu_mem": 5.023330816, + "loss": 1.3891, + "grad_norm": 0.23604154586791992, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.879576576, + "gpu_mem": 5.023363072, + "loss": 1.402, + "grad_norm": 0.2298724204301834, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.8801664, + "gpu_mem": 5.023315456, + "loss": 1.3838, + "grad_norm": 0.1426660418510437, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.880756224, + "gpu_mem": 5.023372288, + "loss": 1.3924, + "grad_norm": 0.31723204255104065, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.88114944, + "gpu_mem": 5.023300096, + "loss": 1.4086, + "grad_norm": 0.1981196254491806, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.881739264, + "gpu_mem": 5.023327744, + "loss": 1.4163, + "grad_norm": 0.1509743332862854, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.88213248, + "gpu_mem": 5.023341568, + "loss": 1.3785, + "grad_norm": 0.14269323647022247, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.882918912, + "gpu_mem": 5.023347712, + "loss": 1.3771, + "grad_norm": 0.16572846472263336, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.883312128, + "gpu_mem": 5.023326208, + "loss": 1.3871, + "grad_norm": 0.23220685124397278, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.883901952, + "gpu_mem": 5.023326208, + "loss": 1.4066, + "grad_norm": 0.21317724883556366, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.884491776, + "gpu_mem": 5.023326208, + "loss": 1.4256, + "grad_norm": 0.33091095089912415, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.884884992, + "gpu_mem": 5.023312384, + "loss": 1.4095, + "grad_norm": 0.15897364914417267, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.885474816, + "gpu_mem": 5.023330816, + "loss": 1.3991, + "grad_norm": 0.24789851903915405, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.885868032, + "gpu_mem": 5.023343104, + "loss": 1.4124, + "grad_norm": 0.20800015330314636, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.886457856, + "gpu_mem": 5.023320064, + "loss": 1.392, + "grad_norm": 0.2078103870153427, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.886851072, + "gpu_mem": 5.023304704, + "loss": 1.358, + "grad_norm": 0.172748401761055, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.887440896, + "gpu_mem": 5.023309312, + "loss": 1.3749, + "grad_norm": 0.18119178712368011, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.88803072, + "gpu_mem": 5.02333696, + "loss": 1.4467, + "grad_norm": 0.4085313379764557, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.888423936, + "gpu_mem": 5.023312384, + "loss": 1.3875, + "grad_norm": 0.24164755642414093, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.888817152, + "gpu_mem": 5.023330816, + "loss": 1.4113, + "grad_norm": 0.25499606132507324, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.889406976, + "gpu_mem": 5.023324672, + "loss": 1.4001, + "grad_norm": 0.22803981602191925, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.8899968, + "gpu_mem": 5.02329088, + "loss": 1.3818, + "grad_norm": 0.15454097092151642, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.890390016, + "gpu_mem": 5.023320064, + "loss": 1.4214, + "grad_norm": 0.20220178365707397, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.89097984, + "gpu_mem": 5.023303168, + "loss": 1.3422, + "grad_norm": 0.1734497994184494, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.891373056, + "gpu_mem": 5.02334464, + "loss": 1.3723, + "grad_norm": 0.18264496326446533, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.891766272, + "gpu_mem": 5.023310848, + "loss": 1.3939, + "grad_norm": 0.12945598363876343, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.892356096, + "gpu_mem": 5.023350784, + "loss": 1.3213, + "grad_norm": 0.13898183405399323, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.892749312, + "gpu_mem": 5.023304704, + "loss": 1.4769, + "grad_norm": 0.22493194043636322, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.893142528, + "gpu_mem": 5.023309312, + "loss": 1.4785, + "grad_norm": 0.2292277216911316, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.893535744, + "gpu_mem": 5.02330624, + "loss": 1.472, + "grad_norm": 0.25121432542800903, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.89392896, + "gpu_mem": 5.023324672, + "loss": 1.4528, + "grad_norm": 0.15623274445533752, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.894518784, + "gpu_mem": 5.023316992, + "loss": 1.3979, + "grad_norm": 0.096467986702919, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.894912, + "gpu_mem": 5.023301632, + "loss": 1.3966, + "grad_norm": 0.14920856058597565, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.895305216, + "gpu_mem": 5.023372288, + "loss": 1.428, + "grad_norm": 0.19545769691467285, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.89589504, + "gpu_mem": 5.023323136, + "loss": 1.4113, + "grad_norm": 0.18419542908668518, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.896288256, + "gpu_mem": 5.023347712, + "loss": 1.4263, + "grad_norm": 0.25037682056427, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.89687808, + "gpu_mem": 5.023318528, + "loss": 1.3719, + "grad_norm": 0.07696887105703354, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.897271296, + "gpu_mem": 5.023310848, + "loss": 1.3715, + "grad_norm": 0.15148325264453888, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.897664512, + "gpu_mem": 5.023304704, + "loss": 1.3747, + "grad_norm": 0.10169406980276108, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.898057728, + "gpu_mem": 5.023333888, + "loss": 1.4393, + "grad_norm": 0.19543886184692383, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.898450944, + "gpu_mem": 5.023324672, + "loss": 1.4814, + "grad_norm": 0.3185161054134369, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.89884416, + "gpu_mem": 5.023312384, + "loss": 1.4567, + "grad_norm": 0.2161218374967575, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.899237376, + "gpu_mem": 5.023304704, + "loss": 1.3661, + "grad_norm": 0.10899048298597336, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.899630592, + "gpu_mem": 5.023356928, + "loss": 1.3987, + "grad_norm": 0.13670292496681213, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.900023808, + "gpu_mem": 5.023335424, + "loss": 1.4082, + "grad_norm": 0.0789019837975502, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.900417024, + "gpu_mem": 5.02332928, + "loss": 1.3782, + "grad_norm": 0.12293507158756256, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.90081024, + "gpu_mem": 5.02330624, + "loss": 1.3893, + "grad_norm": 0.13373561203479767, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.901203456, + "gpu_mem": 5.023327744, + "loss": 1.3955, + "grad_norm": 0.14496657252311707, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.90179328, + "gpu_mem": 5.023300096, + "loss": 1.4229, + "grad_norm": 0.1542472392320633, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.901989888, + "gpu_mem": 5.023307776, + "loss": 1.387, + "grad_norm": 0.07924240827560425, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.902383104, + "gpu_mem": 5.023326208, + "loss": 1.3704, + "grad_norm": 0.06577291339635849, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.902972928, + "gpu_mem": 5.023315456, + "loss": 1.4021, + "grad_norm": 0.11769996583461761, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.903366144, + "gpu_mem": 5.02331392, + "loss": 1.414, + "grad_norm": 0.17415818572044373, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.903562752, + "gpu_mem": 5.023309312, + "loss": 1.3941, + "grad_norm": 0.1198030486702919, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.903955968, + "gpu_mem": 5.02331392, + "loss": 1.4103, + "grad_norm": 0.13897398114204407, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.904349184, + "gpu_mem": 5.023324672, + "loss": 1.3651, + "grad_norm": 0.0625867173075676, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.9047424, + "gpu_mem": 5.023327744, + "loss": 1.4014, + "grad_norm": 0.09447936713695526, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.905135616, + "gpu_mem": 5.023327744, + "loss": 1.4095, + "grad_norm": 0.09981872141361237, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.905528832, + "gpu_mem": 5.023323136, + "loss": 1.4025, + "grad_norm": 0.11521104723215103, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.905922048, + "gpu_mem": 5.023341568, + "loss": 1.3759, + "grad_norm": 0.10609796643257141, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.906315264, + "gpu_mem": 5.02334464, + "loss": 1.3869, + "grad_norm": 0.06238195300102234, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.90670848, + "gpu_mem": 5.0233216, + "loss": 1.3975, + "grad_norm": 0.08044589310884476, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.907101696, + "gpu_mem": 5.023332352, + "loss": 1.396, + "grad_norm": 0.06972131133079529, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.907494912, + "gpu_mem": 5.023332352, + "loss": 1.3856, + "grad_norm": 0.13119818270206451, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.907888128, + "gpu_mem": 5.023307776, + "loss": 1.3967, + "grad_norm": 0.0990079715847969, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.908281344, + "gpu_mem": 5.02333696, + "loss": 1.4089, + "grad_norm": 0.16337399184703827, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.90867456, + "gpu_mem": 5.02331392, + "loss": 1.3764, + "grad_norm": 0.13376621901988983, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.909067776, + "gpu_mem": 5.023330816, + "loss": 1.4009, + "grad_norm": 0.10628398507833481, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.909460992, + "gpu_mem": 5.02329856, + "loss": 1.396, + "grad_norm": 0.130917027592659, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.9096576, + "gpu_mem": 5.02331392, + "loss": 1.4, + "grad_norm": 0.11310996115207672, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.909854208, + "gpu_mem": 5.023293952, + "loss": 1.3847, + "grad_norm": 0.11370261013507843, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.910247424, + "gpu_mem": 5.023335424, + "loss": 1.3771, + "grad_norm": 0.05997665598988533, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.91064064, + "gpu_mem": 5.023330816, + "loss": 1.3852, + "grad_norm": 0.06802321970462799, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.911033856, + "gpu_mem": 5.02333696, + "loss": 1.3937, + "grad_norm": 0.13648726046085358, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.911230464, + "gpu_mem": 5.023333888, + "loss": 1.3871, + "grad_norm": 0.1030195951461792, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.91162368, + "gpu_mem": 5.023335424, + "loss": 1.3781, + "grad_norm": 0.10505808889865875, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.912016896, + "gpu_mem": 5.023332352, + "loss": 1.3929, + "grad_norm": 0.11069456487894058, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.912213504, + "gpu_mem": 5.023312384, + "loss": 1.3958, + "grad_norm": 0.08010127395391464, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.91260672, + "gpu_mem": 5.023307776, + "loss": 1.3965, + "grad_norm": 0.15402397513389587, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.912999936, + "gpu_mem": 5.023326208, + "loss": 1.3984, + "grad_norm": 0.06303899735212326, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.913196544, + "gpu_mem": 5.02333696, + "loss": 1.3956, + "grad_norm": 0.078012615442276, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.91358976, + "gpu_mem": 5.023323136, + "loss": 1.3843, + "grad_norm": 0.06066165119409561, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.913982976, + "gpu_mem": 5.023338496, + "loss": 1.4142, + "grad_norm": 0.16598118841648102, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.914376192, + "gpu_mem": 5.023320064, + "loss": 1.3568, + "grad_norm": 0.11242596805095673, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.914769408, + "gpu_mem": 5.023346176, + "loss": 1.4179, + "grad_norm": 0.13754227757453918, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.915162624, + "gpu_mem": 5.023304704, + "loss": 1.396, + "grad_norm": 0.10168537497520447, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.915359232, + "gpu_mem": 5.02333696, + "loss": 1.3783, + "grad_norm": 0.08319191634654999, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.915752448, + "gpu_mem": 5.023330816, + "loss": 1.3981, + "grad_norm": 0.08464114367961884, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.916145664, + "gpu_mem": 5.023332352, + "loss": 1.3803, + "grad_norm": 0.10402780026197433, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.916342272, + "gpu_mem": 5.023307776, + "loss": 1.3579, + "grad_norm": 0.07578554004430771, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.916735488, + "gpu_mem": 5.023316992, + "loss": 1.3536, + "grad_norm": 0.08276208490133286, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.916932096, + "gpu_mem": 5.023303168, + "loss": 1.423, + "grad_norm": 0.13037818670272827, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.917325312, + "gpu_mem": 5.023340032, + "loss": 1.3668, + "grad_norm": 0.07861297577619553, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.91752192, + "gpu_mem": 5.02333696, + "loss": 1.3881, + "grad_norm": 0.06564931571483612, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.917915136, + "gpu_mem": 5.02333696, + "loss": 1.4201, + "grad_norm": 0.10143253952264786, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.918308352, + "gpu_mem": 5.023326208, + "loss": 1.4068, + "grad_norm": 0.10862765461206436, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.91850496, + "gpu_mem": 5.023326208, + "loss": 1.3563, + "grad_norm": 0.10232546180486679, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.918898176, + "gpu_mem": 5.023307776, + "loss": 1.3839, + "grad_norm": 0.08506571501493454, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.919291392, + "gpu_mem": 5.023318528, + "loss": 1.4129, + "grad_norm": 0.12507399916648865, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.919684608, + "gpu_mem": 5.023327744, + "loss": 1.3824, + "grad_norm": 0.10143767297267914, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.919881216, + "gpu_mem": 5.023343104, + "loss": 1.3949, + "grad_norm": 0.11637111753225327, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.920274432, + "gpu_mem": 5.02329088, + "loss": 1.3766, + "grad_norm": 0.06515251100063324, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.92047104, + "gpu_mem": 5.023310848, + "loss": 1.403, + "grad_norm": 0.09658361971378326, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.920667648, + "gpu_mem": 5.023292416, + "loss": 1.3866, + "grad_norm": 0.11037879437208176, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.920864256, + "gpu_mem": 5.023309312, + "loss": 1.3817, + "grad_norm": 0.11071012914180756, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.921060864, + "gpu_mem": 5.023315456, + "loss": 1.3836, + "grad_norm": 0.0940612182021141, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.92145408, + "gpu_mem": 5.023312384, + "loss": 1.385, + "grad_norm": 0.06641366332769394, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.921650688, + "gpu_mem": 5.023338496, + "loss": 1.3838, + "grad_norm": 0.05796550586819649, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.922043904, + "gpu_mem": 5.023312384, + "loss": 1.3886, + "grad_norm": 0.06009750813245773, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.92243712, + "gpu_mem": 5.02335232, + "loss": 1.3898, + "grad_norm": 0.12071718275547028, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.922633728, + "gpu_mem": 5.023301632, + "loss": 1.383, + "grad_norm": 0.0640796348452568, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.923026944, + "gpu_mem": 5.023310848, + "loss": 1.4148, + "grad_norm": 0.11847314983606339, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.923223552, + "gpu_mem": 5.023330816, + "loss": 1.3763, + "grad_norm": 0.10196895897388458, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.92342016, + "gpu_mem": 5.0233216, + "loss": 1.3809, + "grad_norm": 0.07185522466897964, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.923616768, + "gpu_mem": 5.023333888, + "loss": 1.3741, + "grad_norm": 0.08821900933980942, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.924009984, + "gpu_mem": 5.02329856, + "loss": 1.3888, + "grad_norm": 0.1223701760172844, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.924206592, + "gpu_mem": 5.02332928, + "loss": 1.3955, + "grad_norm": 0.12652020156383514, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.924599808, + "gpu_mem": 5.023324672, + "loss": 1.3702, + "grad_norm": 0.0953860655426979, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.924796416, + "gpu_mem": 5.023349248, + "loss": 1.3853, + "grad_norm": 0.07284790277481079, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.924993024, + "gpu_mem": 5.023286272, + "loss": 1.3922, + "grad_norm": 0.08390913903713226, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.925189632, + "gpu_mem": 5.023340032, + "loss": 1.3708, + "grad_norm": 0.06896407902240753, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.925582848, + "gpu_mem": 5.023332352, + "loss": 1.3953, + "grad_norm": 0.08345433324575424, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.925779456, + "gpu_mem": 5.023347712, + "loss": 1.3974, + "grad_norm": 0.06772471219301224, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.926172672, + "gpu_mem": 5.023349248, + "loss": 1.3889, + "grad_norm": 0.0739889070391655, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.92636928, + "gpu_mem": 5.023316992, + "loss": 1.4295, + "grad_norm": 0.12384482473134995, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.926762496, + "gpu_mem": 5.023309312, + "loss": 1.4051, + "grad_norm": 0.12709808349609375, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.926959104, + "gpu_mem": 5.02334464, + "loss": 1.401, + "grad_norm": 0.0774528980255127, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.927155712, + "gpu_mem": 5.023292416, + "loss": 1.4011, + "grad_norm": 0.12956048548221588, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.92735232, + "gpu_mem": 5.023327744, + "loss": 1.3806, + "grad_norm": 0.10378757864236832, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.927745536, + "gpu_mem": 5.023316992, + "loss": 1.3765, + "grad_norm": 0.05631071329116821, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.927942144, + "gpu_mem": 5.023349248, + "loss": 1.3914, + "grad_norm": 0.09063878655433655, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.928138752, + "gpu_mem": 5.023343104, + "loss": 1.378, + "grad_norm": 0.12472362816333771, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.92833536, + "gpu_mem": 5.0233216, + "loss": 1.3786, + "grad_norm": 0.05094921216368675, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.928728576, + "gpu_mem": 5.023318528, + "loss": 1.3792, + "grad_norm": 0.09183906763792038, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.928925184, + "gpu_mem": 5.023355392, + "loss": 1.3994, + "grad_norm": 0.06827178597450256, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.929121792, + "gpu_mem": 5.02332928, + "loss": 1.3729, + "grad_norm": 0.10591227561235428, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.9293184, + "gpu_mem": 5.02330624, + "loss": 1.3941, + "grad_norm": 0.09883173555135727, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.929515008, + "gpu_mem": 5.023332352, + "loss": 1.4118, + "grad_norm": 0.11038918793201447, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.929711616, + "gpu_mem": 5.02334464, + "loss": 1.4102, + "grad_norm": 0.10931120067834854, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.929908224, + "gpu_mem": 5.02330624, + "loss": 1.398, + "grad_norm": 0.09447190165519714, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.930104832, + "gpu_mem": 5.023315456, + "loss": 1.4014, + "grad_norm": 0.07802090793848038, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.93030144, + "gpu_mem": 5.023297024, + "loss": 1.3812, + "grad_norm": 0.08479056507349014, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.930694656, + "gpu_mem": 5.023310848, + "loss": 1.3929, + "grad_norm": 0.12547212839126587, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.930891264, + "gpu_mem": 5.023318528, + "loss": 1.3912, + "grad_norm": 0.08018206804990768, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.931087872, + "gpu_mem": 5.023300096, + "loss": 1.407, + "grad_norm": 0.11204525828361511, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.93128448, + "gpu_mem": 5.023330816, + "loss": 1.3876, + "grad_norm": 0.0805511623620987, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.931481088, + "gpu_mem": 5.023301632, + "loss": 1.3953, + "grad_norm": 0.1375339925289154, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.931677696, + "gpu_mem": 5.023326208, + "loss": 1.3767, + "grad_norm": 0.06267879903316498, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.932070912, + "gpu_mem": 5.02330624, + "loss": 1.3936, + "grad_norm": 0.10652276128530502, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.932464128, + "gpu_mem": 5.023338496, + "loss": 1.4089, + "grad_norm": 0.13011586666107178, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.932660736, + "gpu_mem": 5.02332928, + "loss": 1.3925, + "grad_norm": 0.09760840237140656, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.932857344, + "gpu_mem": 5.023324672, + "loss": 1.4125, + "grad_norm": 0.1506986767053604, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.933053952, + "gpu_mem": 5.023281664, + "loss": 1.3673, + "grad_norm": 0.06849190592765808, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.93325056, + "gpu_mem": 5.023361536, + "loss": 1.3769, + "grad_norm": 0.04308665171265602, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.933447168, + "gpu_mem": 5.023312384, + "loss": 1.3752, + "grad_norm": 0.10129988938570023, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.933643776, + "gpu_mem": 5.023312384, + "loss": 1.3657, + "grad_norm": 0.07608870416879654, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.933840384, + "gpu_mem": 5.023278592, + "loss": 1.4334, + "grad_norm": 0.17019452154636383, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.934036992, + "gpu_mem": 5.023318528, + "loss": 1.3723, + "grad_norm": 0.10775455087423325, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.9342336, + "gpu_mem": 5.02331392, + "loss": 1.365, + "grad_norm": 0.11186672002077103, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.934430208, + "gpu_mem": 5.023301632, + "loss": 1.3906, + "grad_norm": 0.08264806121587753, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.934626816, + "gpu_mem": 5.023326208, + "loss": 1.4051, + "grad_norm": 0.1857685148715973, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.934823424, + "gpu_mem": 5.023341568, + "loss": 1.4013, + "grad_norm": 0.1446775645017624, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.935020032, + "gpu_mem": 5.02331392, + "loss": 1.3753, + "grad_norm": 0.08465363830327988, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.93521664, + "gpu_mem": 5.023310848, + "loss": 1.398, + "grad_norm": 0.10660053789615631, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.935413248, + "gpu_mem": 5.023326208, + "loss": 1.4222, + "grad_norm": 0.11382400244474411, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.935609856, + "gpu_mem": 5.023303168, + "loss": 1.4015, + "grad_norm": 0.06596478819847107, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.935806464, + "gpu_mem": 5.023304704, + "loss": 1.3938, + "grad_norm": 0.07286670804023743, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.936003072, + "gpu_mem": 5.023346176, + "loss": 1.4018, + "grad_norm": 0.12952618300914764, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.93619968, + "gpu_mem": 5.023316992, + "loss": 1.3922, + "grad_norm": 0.06528601050376892, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.936396288, + "gpu_mem": 5.023316992, + "loss": 1.3773, + "grad_norm": 0.07670952379703522, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.936592896, + "gpu_mem": 5.02331392, + "loss": 1.4002, + "grad_norm": 0.14406822621822357, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.936789504, + "gpu_mem": 5.02331392, + "loss": 1.394, + "grad_norm": 0.07153785228729248, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.93718272, + "gpu_mem": 5.023304704, + "loss": 1.3832, + "grad_norm": 0.0581243596971035, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.93718272, + "gpu_mem": 5.023340032, + "loss": 1.377, + "grad_norm": 0.08443662524223328, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.937379328, + "gpu_mem": 5.023297024, + "loss": 1.3749, + "grad_norm": 0.06430171430110931, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.937575936, + "gpu_mem": 5.023324672, + "loss": 1.3869, + "grad_norm": 0.07202492654323578, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.937772544, + "gpu_mem": 5.023333888, + "loss": 1.4034, + "grad_norm": 0.16461052000522614, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.937969152, + "gpu_mem": 5.02330624, + "loss": 1.3851, + "grad_norm": 0.13475705683231354, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.93816576, + "gpu_mem": 5.023315456, + "loss": 1.3839, + "grad_norm": 0.08623498678207397, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.938362368, + "gpu_mem": 5.023316992, + "loss": 1.3888, + "grad_norm": 0.11942587792873383, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.938558976, + "gpu_mem": 5.023316992, + "loss": 1.3925, + "grad_norm": 0.10716133564710617, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.938755584, + "gpu_mem": 5.023301632, + "loss": 1.3921, + "grad_norm": 0.07383876293897629, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.9391488, + "gpu_mem": 5.023323136, + "loss": 1.3825, + "grad_norm": 0.08507664501667023, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.9391488, + "gpu_mem": 5.023356928, + "loss": 1.3839, + "grad_norm": 0.0924685001373291, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.939345408, + "gpu_mem": 5.023310848, + "loss": 1.3946, + "grad_norm": 0.0966474711894989, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.939542016, + "gpu_mem": 5.023316992, + "loss": 1.3947, + "grad_norm": 0.08543919026851654, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.940721664, + "gpu_mem": 5.023332352, + "loss": 1.3933, + "grad_norm": 0.0923055112361908, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.940918272, + "gpu_mem": 5.023350784, + "loss": 1.3836, + "grad_norm": 0.08221800625324249, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.940918272, + "gpu_mem": 5.023320064, + "loss": 1.3846, + "grad_norm": 0.04437697306275368, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.941311488, + "gpu_mem": 5.02330624, + "loss": 1.3723, + "grad_norm": 0.05562363192439079, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.941508096, + "gpu_mem": 5.02329856, + "loss": 1.3872, + "grad_norm": 0.04246796667575836, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.941704704, + "gpu_mem": 5.023363072, + "loss": 1.3773, + "grad_norm": 0.07069160044193268, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.941901312, + "gpu_mem": 5.023301632, + "loss": 1.3913, + "grad_norm": 0.1456192582845688, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.94209792, + "gpu_mem": 5.023353856, + "loss": 1.3784, + "grad_norm": 0.0418720468878746, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.942294528, + "gpu_mem": 5.023335424, + "loss": 1.3745, + "grad_norm": 0.06194714084267616, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.942491136, + "gpu_mem": 5.023333888, + "loss": 1.381, + "grad_norm": 0.09384922683238983, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.942687744, + "gpu_mem": 5.023338496, + "loss": 1.3802, + "grad_norm": 0.053847797214984894, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.942884352, + "gpu_mem": 5.02331392, + "loss": 1.3957, + "grad_norm": 0.19388212263584137, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.94308096, + "gpu_mem": 5.023343104, + "loss": 1.3958, + "grad_norm": 0.08045337349176407, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.943277568, + "gpu_mem": 5.023320064, + "loss": 1.3887, + "grad_norm": 0.1177559643983841, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.943277568, + "gpu_mem": 5.023381504, + "loss": 1.3873, + "grad_norm": 0.13248060643672943, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.943474176, + "gpu_mem": 5.02330624, + "loss": 1.4077, + "grad_norm": 0.12970410287380219, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.943670784, + "gpu_mem": 5.023316992, + "loss": 1.402, + "grad_norm": 0.09697296470403671, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.943867392, + "gpu_mem": 5.023315456, + "loss": 1.4049, + "grad_norm": 0.08131905645132065, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.944064, + "gpu_mem": 5.023312384, + "loss": 1.3695, + "grad_norm": 0.06505239754915237, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.944260608, + "gpu_mem": 5.023343104, + "loss": 1.3815, + "grad_norm": 0.09809297323226929, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.944457216, + "gpu_mem": 5.0233216, + "loss": 1.3759, + "grad_norm": 0.08884736895561218, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.944653824, + "gpu_mem": 5.023316992, + "loss": 1.3768, + "grad_norm": 0.08147265762090683, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.944850432, + "gpu_mem": 5.023327744, + "loss": 1.3802, + "grad_norm": 0.13457126915454865, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.94504704, + "gpu_mem": 5.023332352, + "loss": 1.3964, + "grad_norm": 0.07021345943212509, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.94504704, + "gpu_mem": 5.023293952, + "loss": 1.3896, + "grad_norm": 0.07000038027763367, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.945243648, + "gpu_mem": 5.023361536, + "loss": 1.3696, + "grad_norm": 0.06343337148427963, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.945440256, + "gpu_mem": 5.023324672, + "loss": 1.3674, + "grad_norm": 0.09255696833133698, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.945636864, + "gpu_mem": 5.02331392, + "loss": 1.4037, + "grad_norm": 0.11339548230171204, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.945636864, + "gpu_mem": 5.023330816, + "loss": 1.3767, + "grad_norm": 0.09691799432039261, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.945833472, + "gpu_mem": 5.023304704, + "loss": 1.401, + "grad_norm": 0.143568217754364, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.94603008, + "gpu_mem": 5.02335232, + "loss": 1.3958, + "grad_norm": 0.08461976796388626, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.94603008, + "gpu_mem": 5.023320064, + "loss": 1.3805, + "grad_norm": 0.10645119100809097, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.946226688, + "gpu_mem": 5.023309312, + "loss": 1.3648, + "grad_norm": 0.07926703244447708, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.946423296, + "gpu_mem": 5.023324672, + "loss": 1.3812, + "grad_norm": 0.06090373918414116, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.946619904, + "gpu_mem": 5.0233216, + "loss": 1.3652, + "grad_norm": 0.05924367904663086, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.946816512, + "gpu_mem": 5.0233216, + "loss": 1.4067, + "grad_norm": 0.16248932480812073, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.946816512, + "gpu_mem": 5.023309312, + "loss": 1.3831, + "grad_norm": 0.09256824851036072, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.94701312, + "gpu_mem": 5.023292416, + "loss": 1.3964, + "grad_norm": 0.1217724159359932, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.94701312, + "gpu_mem": 5.023355392, + "loss": 1.3986, + "grad_norm": 0.09990359097719193, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.947209728, + "gpu_mem": 5.023309312, + "loss": 1.398, + "grad_norm": 0.09939128905534744, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.947406336, + "gpu_mem": 5.023318528, + "loss": 1.3806, + "grad_norm": 0.13333991169929504, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.947602944, + "gpu_mem": 5.023353856, + "loss": 1.3864, + "grad_norm": 0.06326144188642502, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.947602944, + "gpu_mem": 5.023318528, + "loss": 1.3737, + "grad_norm": 0.0626857727766037, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.947799552, + "gpu_mem": 5.023323136, + "loss": 1.3955, + "grad_norm": 0.1097937524318695, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.94799616, + "gpu_mem": 5.023370752, + "loss": 1.3767, + "grad_norm": 0.060742951929569244, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.948192768, + "gpu_mem": 5.023379968, + "loss": 1.3998, + "grad_norm": 0.09859129786491394, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.948389376, + "gpu_mem": 5.023333888, + "loss": 1.3895, + "grad_norm": 0.10059262067079544, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.948585984, + "gpu_mem": 5.023327744, + "loss": 1.4197, + "grad_norm": 0.17765209078788757, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.948585984, + "gpu_mem": 5.023389184, + "loss": 1.3886, + "grad_norm": 0.12458042800426483, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.948782592, + "gpu_mem": 5.023315456, + "loss": 1.3832, + "grad_norm": 0.09236852824687958, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.9489792, + "gpu_mem": 5.02331392, + "loss": 1.3835, + "grad_norm": 0.07377377152442932, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.9489792, + "gpu_mem": 5.023316992, + "loss": 1.3792, + "grad_norm": 0.0759974792599678, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.949175808, + "gpu_mem": 5.023303168, + "loss": 1.3966, + "grad_norm": 0.0979863703250885, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.949372416, + "gpu_mem": 5.023318528, + "loss": 1.39, + "grad_norm": 0.0815352275967598, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.949372416, + "gpu_mem": 5.023356928, + "loss": 1.3881, + "grad_norm": 0.0774846225976944, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.949569024, + "gpu_mem": 5.02333696, + "loss": 1.3784, + "grad_norm": 0.07886581122875214, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.949765632, + "gpu_mem": 5.023363072, + "loss": 1.3846, + "grad_norm": 0.08084969967603683, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.94996224, + "gpu_mem": 5.02331392, + "loss": 1.3654, + "grad_norm": 0.07025124877691269, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.94996224, + "gpu_mem": 5.023307776, + "loss": 1.3926, + "grad_norm": 0.0980955958366394, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.950158848, + "gpu_mem": 5.023330816, + "loss": 1.362, + "grad_norm": 0.06912469118833542, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.950158848, + "gpu_mem": 5.023309312, + "loss": 1.3666, + "grad_norm": 0.08163390308618546, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.950355456, + "gpu_mem": 5.023323136, + "loss": 1.3852, + "grad_norm": 0.0722646713256836, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.950552064, + "gpu_mem": 5.023327744, + "loss": 1.3664, + "grad_norm": 0.10155905038118362, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.950552064, + "gpu_mem": 5.023346176, + "loss": 1.3981, + "grad_norm": 0.12271221727132797, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.950748672, + "gpu_mem": 5.023316992, + "loss": 1.3479, + "grad_norm": 0.09022720903158188, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.950748672, + "gpu_mem": 5.02334464, + "loss": 1.4008, + "grad_norm": 0.0711941197514534, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.95094528, + "gpu_mem": 5.023326208, + "loss": 1.3712, + "grad_norm": 0.11645002663135529, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.95094528, + "gpu_mem": 5.02331392, + "loss": 1.3944, + "grad_norm": 0.09439445286989212, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.951141888, + "gpu_mem": 5.023323136, + "loss": 1.3784, + "grad_norm": 0.1039954423904419, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.951338496, + "gpu_mem": 5.023320064, + "loss": 1.3851, + "grad_norm": 0.10483407974243164, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.951338496, + "gpu_mem": 5.023335424, + "loss": 1.3965, + "grad_norm": 0.11360722035169601, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.951535104, + "gpu_mem": 5.023343104, + "loss": 1.3748, + "grad_norm": 0.07995929569005966, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.951731712, + "gpu_mem": 5.023332352, + "loss": 1.4001, + "grad_norm": 0.08931335061788559, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.95192832, + "gpu_mem": 5.023316992, + "loss": 1.3833, + "grad_norm": 0.09271125495433807, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.95192832, + "gpu_mem": 5.023320064, + "loss": 1.3717, + "grad_norm": 0.06446663290262222, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.95192832, + "gpu_mem": 5.02331392, + "loss": 1.3742, + "grad_norm": 0.07929670810699463, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.952124928, + "gpu_mem": 5.023309312, + "loss": 1.3965, + "grad_norm": 0.15091188251972198, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.952321536, + "gpu_mem": 5.023330816, + "loss": 1.386, + "grad_norm": 0.07510553300380707, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.952321536, + "gpu_mem": 5.023323136, + "loss": 1.3566, + "grad_norm": 0.0947006493806839, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.952518144, + "gpu_mem": 5.023295488, + "loss": 1.3694, + "grad_norm": 0.06558796763420105, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.952518144, + "gpu_mem": 5.023293952, + "loss": 1.3828, + "grad_norm": 0.07512862980365753, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.952714752, + "gpu_mem": 5.023320064, + "loss": 1.3898, + "grad_norm": 0.10914589464664459, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.952714752, + "gpu_mem": 5.023303168, + "loss": 1.3793, + "grad_norm": 0.08353345841169357, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.95291136, + "gpu_mem": 5.023333888, + "loss": 1.3469, + "grad_norm": 0.22554799914360046, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.95291136, + "gpu_mem": 5.023316992, + "loss": 1.4001, + "grad_norm": 0.10435575991868973, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.953107968, + "gpu_mem": 5.023347712, + "loss": 1.397, + "grad_norm": 0.13112817704677582, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.953107968, + "gpu_mem": 5.023315456, + "loss": 1.3918, + "grad_norm": 0.07967864722013474, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.953107968, + "gpu_mem": 5.023341568, + "loss": 1.3601, + "grad_norm": 0.08441589027643204, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.953304576, + "gpu_mem": 5.023316992, + "loss": 1.3803, + "grad_norm": 0.049407023936510086, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.953304576, + "gpu_mem": 5.023312384, + "loss": 1.3857, + "grad_norm": 0.06897051632404327, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.953501184, + "gpu_mem": 5.023315456, + "loss": 1.3719, + "grad_norm": 0.09136711806058884, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.953501184, + "gpu_mem": 5.023333888, + "loss": 1.3894, + "grad_norm": 0.08591338992118835, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.953501184, + "gpu_mem": 5.02331392, + "loss": 1.372, + "grad_norm": 0.10230668634176254, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.953697792, + "gpu_mem": 5.023318528, + "loss": 1.3875, + "grad_norm": 0.0709449052810669, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.9538944, + "gpu_mem": 5.02331392, + "loss": 1.4044, + "grad_norm": 0.10817074775695801, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.954091008, + "gpu_mem": 5.0233216, + "loss": 1.3857, + "grad_norm": 0.05447469279170036, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.954287616, + "gpu_mem": 5.023346176, + "loss": 1.3759, + "grad_norm": 0.0846274197101593, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.954287616, + "gpu_mem": 5.023338496, + "loss": 1.3757, + "grad_norm": 0.08072542399168015, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.954287616, + "gpu_mem": 5.023340032, + "loss": 1.3845, + "grad_norm": 0.08584184944629669, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.954484224, + "gpu_mem": 5.023315456, + "loss": 1.3816, + "grad_norm": 0.08229397237300873, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.954484224, + "gpu_mem": 5.023316992, + "loss": 1.3908, + "grad_norm": 0.0681251585483551, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.954680832, + "gpu_mem": 5.02333696, + "loss": 1.383, + "grad_norm": 0.07159695029258728, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.954680832, + "gpu_mem": 5.023309312, + "loss": 1.3828, + "grad_norm": 0.12737791240215302, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.95487744, + "gpu_mem": 5.0233216, + "loss": 1.3471, + "grad_norm": 0.08340204507112503, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.95487744, + "gpu_mem": 5.023330816, + "loss": 1.3914, + "grad_norm": 0.1098005622625351, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.955074048, + "gpu_mem": 5.023307776, + "loss": 1.3666, + "grad_norm": 0.14338287711143494, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.955074048, + "gpu_mem": 5.023332352, + "loss": 1.3879, + "grad_norm": 0.11075535416603088, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.955074048, + "gpu_mem": 5.023332352, + "loss": 1.3707, + "grad_norm": 0.07359617203474045, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.955270656, + "gpu_mem": 5.023315456, + "loss": 1.3745, + "grad_norm": 0.09032268077135086, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.955270656, + "gpu_mem": 5.023312384, + "loss": 1.3749, + "grad_norm": 0.18635468184947968, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.955467264, + "gpu_mem": 5.023304704, + "loss": 1.3794, + "grad_norm": 0.08387758582830429, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.955467264, + "gpu_mem": 5.023343104, + "loss": 1.4164, + "grad_norm": 0.13700629770755768, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.955467264, + "gpu_mem": 5.023320064, + "loss": 1.3828, + "grad_norm": 0.07802138477563858, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.955663872, + "gpu_mem": 5.023318528, + "loss": 1.3888, + "grad_norm": 0.07542707026004791, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.95586048, + "gpu_mem": 5.023335424, + "loss": 1.3867, + "grad_norm": 0.10383594036102295, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.95586048, + "gpu_mem": 5.023320064, + "loss": 1.3536, + "grad_norm": 0.06877502053976059, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.95586048, + "gpu_mem": 5.023332352, + "loss": 1.4029, + "grad_norm": 0.11512263864278793, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.956057088, + "gpu_mem": 5.02334464, + "loss": 1.3785, + "grad_norm": 0.09147729724645615, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.956057088, + "gpu_mem": 5.023320064, + "loss": 1.3827, + "grad_norm": 0.09075796604156494, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.956057088, + "gpu_mem": 5.023364608, + "loss": 1.3985, + "grad_norm": 0.13801220059394836, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.956253696, + "gpu_mem": 5.023338496, + "loss": 1.3569, + "grad_norm": 0.06648971140384674, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.956253696, + "gpu_mem": 5.023335424, + "loss": 1.377, + "grad_norm": 0.09948590397834778, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.956253696, + "gpu_mem": 5.023316992, + "loss": 1.3666, + "grad_norm": 0.0629144236445427, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.956253696, + "gpu_mem": 5.023323136, + "loss": 1.3989, + "grad_norm": 0.123702272772789, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.956450304, + "gpu_mem": 5.023292416, + "loss": 1.385, + "grad_norm": 0.08887743949890137, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.956450304, + "gpu_mem": 5.023356928, + "loss": 1.3839, + "grad_norm": 0.08047608286142349, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.956646912, + "gpu_mem": 5.023310848, + "loss": 1.4031, + "grad_norm": 0.11981704831123352, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.956646912, + "gpu_mem": 5.023304704, + "loss": 1.3818, + "grad_norm": 0.08963096886873245, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.95684352, + "gpu_mem": 5.02336, + "loss": 1.3643, + "grad_norm": 0.05309353768825531, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.95684352, + "gpu_mem": 5.023326208, + "loss": 1.3898, + "grad_norm": 0.12192069739103317, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.957040128, + "gpu_mem": 5.02331392, + "loss": 1.3463, + "grad_norm": 0.10794790089130402, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.957040128, + "gpu_mem": 5.023318528, + "loss": 1.3833, + "grad_norm": 0.08630389720201492, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.957040128, + "gpu_mem": 5.02329856, + "loss": 1.3688, + "grad_norm": 0.07415412366390228, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.957236736, + "gpu_mem": 5.023323136, + "loss": 1.3572, + "grad_norm": 0.07280701398849487, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.957433344, + "gpu_mem": 5.023301632, + "loss": 1.3782, + "grad_norm": 0.06400305777788162, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.957433344, + "gpu_mem": 5.023318528, + "loss": 1.3719, + "grad_norm": 0.07611364126205444, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.957433344, + "gpu_mem": 5.0232832, + "loss": 1.3844, + "grad_norm": 0.07107598334550858, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.957433344, + "gpu_mem": 5.023315456, + "loss": 1.3617, + "grad_norm": 0.08351916074752808, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.957433344, + "gpu_mem": 5.023304704, + "loss": 1.3778, + "grad_norm": 0.09009706228971481, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.957629952, + "gpu_mem": 5.023341568, + "loss": 1.3714, + "grad_norm": 0.09729083627462387, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.95782656, + "gpu_mem": 5.023307776, + "loss": 1.363, + "grad_norm": 0.08235626667737961, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.95782656, + "gpu_mem": 5.023330816, + "loss": 1.3807, + "grad_norm": 0.10569480806589127, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.95782656, + "gpu_mem": 5.023320064, + "loss": 1.3637, + "grad_norm": 0.09219318628311157, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.95782656, + "gpu_mem": 5.023326208, + "loss": 1.3522, + "grad_norm": 0.07217380404472351, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.958023168, + "gpu_mem": 5.023320064, + "loss": 1.3988, + "grad_norm": 0.13897374272346497, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.958023168, + "gpu_mem": 5.023338496, + "loss": 1.3817, + "grad_norm": 0.09503920376300812, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.958023168, + "gpu_mem": 5.02329856, + "loss": 1.3747, + "grad_norm": 0.06174430251121521, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.958023168, + "gpu_mem": 5.023330816, + "loss": 1.3558, + "grad_norm": 0.07876177132129669, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.958219776, + "gpu_mem": 5.023350784, + "loss": 1.3788, + "grad_norm": 0.09520623832941055, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.958219776, + "gpu_mem": 5.02334464, + "loss": 1.3656, + "grad_norm": 0.09041670709848404, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.958219776, + "gpu_mem": 5.023307776, + "loss": 1.3891, + "grad_norm": 0.12929053604602814, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.958219776, + "gpu_mem": 5.023324672, + "loss": 1.3625, + "grad_norm": 0.15532872080802917, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.958416384, + "gpu_mem": 5.023301632, + "loss": 1.3407, + "grad_norm": 0.13268089294433594, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.958416384, + "gpu_mem": 5.023333888, + "loss": 1.3785, + "grad_norm": 0.21846511960029602, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.958612992, + "gpu_mem": 5.02332928, + "loss": 1.3383, + "grad_norm": 0.10668228566646576, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.958612992, + "gpu_mem": 5.023338496, + "loss": 1.3634, + "grad_norm": 0.10398916900157928, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.958612992, + "gpu_mem": 5.023312384, + "loss": 1.3603, + "grad_norm": 0.09240524470806122, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.958612992, + "gpu_mem": 5.023332352, + "loss": 1.3478, + "grad_norm": 0.19422654807567596, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.958612992, + "gpu_mem": 5.02330624, + "loss": 1.3368, + "grad_norm": 0.11079961806535721, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.9588096, + "gpu_mem": 5.023330816, + "loss": 1.3851, + "grad_norm": 0.12626340985298157, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.9588096, + "gpu_mem": 5.023315456, + "loss": 1.3638, + "grad_norm": 0.21447362005710602, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.9588096, + "gpu_mem": 5.023349248, + "loss": 1.3344, + "grad_norm": 0.18830470740795135, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.959006208, + "gpu_mem": 5.02332928, + "loss": 1.3477, + "grad_norm": 0.1189810112118721, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.959202816, + "gpu_mem": 5.02331392, + "loss": 1.3434, + "grad_norm": 0.13893313705921173, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.959202816, + "gpu_mem": 5.023349248, + "loss": 1.3467, + "grad_norm": 0.12827660143375397, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.959202816, + "gpu_mem": 5.023355392, + "loss": 1.3642, + "grad_norm": 0.19886890053749084, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.959202816, + "gpu_mem": 5.023318528, + "loss": 1.4113, + "grad_norm": 0.23842692375183105, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.959202816, + "gpu_mem": 5.023297024, + "loss": 1.3482, + "grad_norm": 0.22474387288093567, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.959202816, + "gpu_mem": 5.023349248, + "loss": 1.3061, + "grad_norm": 0.19454948604106903, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.959202816, + "gpu_mem": 5.023335424, + "loss": 1.3263, + "grad_norm": 0.19075030088424683, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.959399424, + "gpu_mem": 5.02332928, + "loss": 1.3416, + "grad_norm": 0.25634366273880005, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.959399424, + "gpu_mem": 5.023335424, + "loss": 1.335, + "grad_norm": 0.2184402197599411, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.959399424, + "gpu_mem": 5.023312384, + "loss": 1.3894, + "grad_norm": 0.2978273630142212, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.959399424, + "gpu_mem": 5.023326208, + "loss": 1.3641, + "grad_norm": 0.3813970983028412, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.959596032, + "gpu_mem": 5.023326208, + "loss": 1.3798, + "grad_norm": 0.24497756361961365, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.959596032, + "gpu_mem": 5.023295488, + "loss": 1.3053, + "grad_norm": 0.2516267001628876, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.959596032, + "gpu_mem": 5.02332928, + "loss": 1.3492, + "grad_norm": 0.21037733554840088, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.959596032, + "gpu_mem": 5.023307776, + "loss": 1.2994, + "grad_norm": 0.1526341438293457, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.95979264, + "gpu_mem": 5.023315456, + "loss": 1.3379, + "grad_norm": 0.24322471022605896, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.95979264, + "gpu_mem": 5.023333888, + "loss": 1.3125, + "grad_norm": 0.15088976919651031, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.95979264, + "gpu_mem": 5.023301632, + "loss": 1.3917, + "grad_norm": 0.2748621702194214, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.959989248, + "gpu_mem": 5.02330624, + "loss": 1.3279, + "grad_norm": 0.1796865016222, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.959989248, + "gpu_mem": 5.023301632, + "loss": 1.2976, + "grad_norm": 0.21324394643306732, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.959989248, + "gpu_mem": 5.023346176, + "loss": 1.3743, + "grad_norm": 0.16637343168258667, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.960185856, + "gpu_mem": 5.02332928, + "loss": 1.3303, + "grad_norm": 0.28788501024246216, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.960185856, + "gpu_mem": 5.023318528, + "loss": 1.2782, + "grad_norm": 0.16239605844020844, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.960185856, + "gpu_mem": 5.023340032, + "loss": 1.2789, + "grad_norm": 0.2628838121891022, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.960185856, + "gpu_mem": 5.02330624, + "loss": 1.2985, + "grad_norm": 0.1943783015012741, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.960382464, + "gpu_mem": 5.0233216, + "loss": 1.3092, + "grad_norm": 0.26594457030296326, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.960382464, + "gpu_mem": 5.0233216, + "loss": 1.3565, + "grad_norm": 0.28785601258277893, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.960382464, + "gpu_mem": 5.023312384, + "loss": 1.2666, + "grad_norm": 0.25067195296287537, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.960579072, + "gpu_mem": 5.023323136, + "loss": 1.4236, + "grad_norm": 0.6054099798202515, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.960579072, + "gpu_mem": 5.023347712, + "loss": 1.3219, + "grad_norm": 0.2349090874195099, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.960579072, + "gpu_mem": 5.023300096, + "loss": 1.3109, + "grad_norm": 0.18480688333511353, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.960579072, + "gpu_mem": 5.023335424, + "loss": 1.27, + "grad_norm": 0.24125900864601135, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.960579072, + "gpu_mem": 5.023297024, + "loss": 1.2903, + "grad_norm": 0.3414970934391022, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.960579072, + "gpu_mem": 5.023315456, + "loss": 1.347, + "grad_norm": 0.27936938405036926, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.960579072, + "gpu_mem": 5.023307776, + "loss": 1.3129, + "grad_norm": 0.232723668217659, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.96077568, + "gpu_mem": 5.02334464, + "loss": 1.2894, + "grad_norm": 0.23088233172893524, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.96077568, + "gpu_mem": 5.023304704, + "loss": 1.3046, + "grad_norm": 0.32552120089530945, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.96077568, + "gpu_mem": 5.023318528, + "loss": 1.3582, + "grad_norm": 0.46067243814468384, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.960972288, + "gpu_mem": 5.023323136, + "loss": 1.2574, + "grad_norm": 0.3221355974674225, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.960972288, + "gpu_mem": 5.023284736, + "loss": 1.3224, + "grad_norm": 0.3334082365036011, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.960972288, + "gpu_mem": 5.023307776, + "loss": 1.3328, + "grad_norm": 0.20741058886051178, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.961168896, + "gpu_mem": 5.02330624, + "loss": 1.3081, + "grad_norm": 0.2283719778060913, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.961168896, + "gpu_mem": 5.023324672, + "loss": 1.3136, + "grad_norm": 0.2267938107252121, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.961168896, + "gpu_mem": 5.0233216, + "loss": 1.4204, + "grad_norm": 0.24719291925430298, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.961168896, + "gpu_mem": 5.023320064, + "loss": 1.2727, + "grad_norm": 0.200495645403862, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023338496, + "loss": 1.1902, + "grad_norm": 0.20681127905845642, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023300096, + "loss": 1.2849, + "grad_norm": 0.18254365026950836, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.961365504, + "gpu_mem": 5.02334464, + "loss": 1.2628, + "grad_norm": 0.23995956778526306, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023309312, + "loss": 1.2366, + "grad_norm": 0.2192598581314087, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.961365504, + "gpu_mem": 5.02333696, + "loss": 1.2472, + "grad_norm": 0.2615692913532257, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023316992, + "loss": 1.2876, + "grad_norm": 0.18461230397224426, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023363072, + "loss": 1.3166, + "grad_norm": 0.2970774471759796, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023327744, + "loss": 1.2754, + "grad_norm": 0.2147286981344223, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023318528, + "loss": 1.2999, + "grad_norm": 0.2667350769042969, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.961365504, + "gpu_mem": 5.023312384, + "loss": 1.2495, + "grad_norm": 0.24207349121570587, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.961562112, + "gpu_mem": 5.023297024, + "loss": 1.2769, + "grad_norm": 0.22169771790504456, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.961562112, + "gpu_mem": 5.023315456, + "loss": 1.307, + "grad_norm": 0.317728728055954, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.961562112, + "gpu_mem": 5.023316992, + "loss": 1.2102, + "grad_norm": 0.27581551671028137, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.961562112, + "gpu_mem": 5.0233216, + "loss": 1.2022, + "grad_norm": 0.3589105010032654, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.96175872, + "gpu_mem": 5.023324672, + "loss": 1.2748, + "grad_norm": 0.26467981934547424, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.96175872, + "gpu_mem": 5.023318528, + "loss": 1.3291, + "grad_norm": 0.38566431403160095, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.96175872, + "gpu_mem": 5.02334464, + "loss": 1.1793, + "grad_norm": 0.2767994999885559, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.96175872, + "gpu_mem": 5.023312384, + "loss": 1.1801, + "grad_norm": 0.33007732033729553, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.96175872, + "gpu_mem": 5.023340032, + "loss": 1.2342, + "grad_norm": 0.29500719904899597, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023347712, + "loss": 1.2729, + "grad_norm": 0.46389278769493103, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.961955328, + "gpu_mem": 5.02332928, + "loss": 1.2924, + "grad_norm": 0.3603247106075287, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023315456, + "loss": 1.2223, + "grad_norm": 0.30431416630744934, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023326208, + "loss": 1.2079, + "grad_norm": 0.32069671154022217, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023318528, + "loss": 1.2272, + "grad_norm": 0.31153491139411926, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023335424, + "loss": 1.1917, + "grad_norm": 0.24644835293293, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023307776, + "loss": 1.1148, + "grad_norm": 0.34049850702285767, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023338496, + "loss": 1.3019, + "grad_norm": 0.42138370871543884, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023320064, + "loss": 1.2727, + "grad_norm": 0.40594279766082764, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023307776, + "loss": 1.2337, + "grad_norm": 0.27854859828948975, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023320064, + "loss": 1.2601, + "grad_norm": 0.4017044007778168, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.961955328, + "gpu_mem": 5.023326208, + "loss": 1.1727, + "grad_norm": 0.3153959810733795, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.962151936, + "gpu_mem": 5.02331392, + "loss": 1.185, + "grad_norm": 0.266571581363678, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.962151936, + "gpu_mem": 5.023303168, + "loss": 1.1274, + "grad_norm": 0.29491114616394043, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.962151936, + "gpu_mem": 5.023304704, + "loss": 1.1645, + "grad_norm": 0.2712647616863251, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.962151936, + "gpu_mem": 5.023318528, + "loss": 1.1713, + "grad_norm": 0.2790778577327728, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.962151936, + "gpu_mem": 5.0233216, + "loss": 1.1485, + "grad_norm": 0.33170032501220703, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.962151936, + "gpu_mem": 5.023332352, + "loss": 1.2584, + "grad_norm": 0.5038016438484192, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.962151936, + "gpu_mem": 5.02330624, + "loss": 1.1905, + "grad_norm": 0.3545354902744293, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.962151936, + "gpu_mem": 5.0233216, + "loss": 1.1933, + "grad_norm": 0.3034311830997467, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.962151936, + "gpu_mem": 5.023330816, + "loss": 1.1787, + "grad_norm": 0.41254904866218567, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.962151936, + "gpu_mem": 5.023304704, + "loss": 1.1374, + "grad_norm": 0.44032856822013855, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.962348544, + "gpu_mem": 5.023310848, + "loss": 1.2759, + "grad_norm": 0.5232130289077759, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.962348544, + "gpu_mem": 5.023300096, + "loss": 1.2167, + "grad_norm": 0.6897673010826111, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.962348544, + "gpu_mem": 5.02330624, + "loss": 1.2122, + "grad_norm": 0.40759700536727905, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.962348544, + "gpu_mem": 5.023343104, + "loss": 1.1718, + "grad_norm": 0.3442457914352417, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.962348544, + "gpu_mem": 5.02329088, + "loss": 1.2001, + "grad_norm": 0.3555448353290558, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.962545152, + "gpu_mem": 5.023310848, + "loss": 1.2403, + "grad_norm": 0.3581390976905823, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.962545152, + "gpu_mem": 5.023310848, + "loss": 1.161, + "grad_norm": 0.41425949335098267, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.962545152, + "gpu_mem": 5.023309312, + "loss": 1.1869, + "grad_norm": 0.3776678740978241, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.962545152, + "gpu_mem": 5.023307776, + "loss": 1.0445, + "grad_norm": 0.4371272623538971, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.962545152, + "gpu_mem": 5.023300096, + "loss": 1.159, + "grad_norm": 0.34236231446266174, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.962545152, + "gpu_mem": 5.02336, + "loss": 1.2021, + "grad_norm": 0.44835516810417175, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.962545152, + "gpu_mem": 5.023304704, + "loss": 1.0845, + "grad_norm": 0.41059523820877075, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023287808, + "loss": 1.1263, + "grad_norm": 0.4388139843940735, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023318528, + "loss": 1.1459, + "grad_norm": 0.6627200245857239, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023363072, + "loss": 1.1403, + "grad_norm": 0.5658724904060364, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023343104, + "loss": 1.1378, + "grad_norm": 0.3939796984195709, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023343104, + "loss": 1.1549, + "grad_norm": 0.46567296981811523, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023309312, + "loss": 1.2362, + "grad_norm": 0.6266024112701416, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023333888, + "loss": 1.203, + "grad_norm": 0.47325217723846436, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.96274176, + "gpu_mem": 5.02333696, + "loss": 1.1204, + "grad_norm": 0.47215789556503296, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023315456, + "loss": 1.0368, + "grad_norm": 0.41097962856292725, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023338496, + "loss": 0.9234, + "grad_norm": 0.5529661774635315, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023318528, + "loss": 1.1973, + "grad_norm": 0.46409183740615845, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023343104, + "loss": 0.935, + "grad_norm": 0.3184066414833069, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.96274176, + "gpu_mem": 5.023326208, + "loss": 1.0432, + "grad_norm": 0.3471692204475403, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.96274176, + "gpu_mem": 5.0233216, + "loss": 1.0392, + "grad_norm": 0.44997406005859375, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.962938368, + "gpu_mem": 5.023333888, + "loss": 1.1729, + "grad_norm": 0.3817955255508423, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.962938368, + "gpu_mem": 5.023304704, + "loss": 1.132, + "grad_norm": 0.500446081161499, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.962938368, + "gpu_mem": 5.023318528, + "loss": 1.1055, + "grad_norm": 0.43451279401779175, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023304704, + "loss": 1.1108, + "grad_norm": 0.5038739442825317, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.963134976, + "gpu_mem": 5.02329856, + "loss": 1.2256, + "grad_norm": 0.40767911076545715, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023304704, + "loss": 1.0467, + "grad_norm": 0.39241424202919006, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023318528, + "loss": 1.1523, + "grad_norm": 0.3733694851398468, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023301632, + "loss": 1.1077, + "grad_norm": 0.5801820755004883, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023355392, + "loss": 0.9985, + "grad_norm": 0.37825968861579895, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.963134976, + "gpu_mem": 5.02329856, + "loss": 1.0761, + "grad_norm": 0.39953407645225525, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023376896, + "loss": 1.1198, + "grad_norm": 0.5847802758216858, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023320064, + "loss": 0.9751, + "grad_norm": 0.5731764435768127, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023338496, + "loss": 1.1274, + "grad_norm": 0.5150119662284851, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.963134976, + "gpu_mem": 5.02331392, + "loss": 1.0382, + "grad_norm": 0.543439507484436, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023346176, + "loss": 0.9566, + "grad_norm": 0.3839191496372223, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023366144, + "loss": 1.202, + "grad_norm": 0.5727313756942749, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023295488, + "loss": 0.9978, + "grad_norm": 0.4193228483200073, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023309312, + "loss": 1.0528, + "grad_norm": 0.5263128876686096, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023293952, + "loss": 1.1261, + "grad_norm": 0.5092751979827881, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.963134976, + "gpu_mem": 5.023332352, + "loss": 0.9409, + "grad_norm": 0.567868709564209, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.963331584, + "gpu_mem": 5.023332352, + "loss": 1.1634, + "grad_norm": 0.5862645506858826, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.963331584, + "gpu_mem": 5.023318528, + "loss": 1.1769, + "grad_norm": 0.5031980872154236, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.963331584, + "gpu_mem": 5.023309312, + "loss": 1.0407, + "grad_norm": 0.4071241617202759, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.963331584, + "gpu_mem": 5.02331392, + "loss": 1.0531, + "grad_norm": 0.4046994745731354, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.963331584, + "gpu_mem": 5.023316992, + "loss": 1.2217, + "grad_norm": 0.6485798954963684, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.963331584, + "gpu_mem": 5.023323136, + "loss": 1.0031, + "grad_norm": 0.41321662068367004, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.963331584, + "gpu_mem": 5.023341568, + "loss": 1.0791, + "grad_norm": 0.39257705211639404, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.963331584, + "gpu_mem": 5.023335424, + "loss": 1.14, + "grad_norm": 0.5263834595680237, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023312384, + "loss": 1.0784, + "grad_norm": 0.5483826398849487, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023300096, + "loss": 1.1055, + "grad_norm": 0.5275043845176697, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023266304, + "loss": 1.1323, + "grad_norm": 0.8036730289459229, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.963528192, + "gpu_mem": 5.02331392, + "loss": 1.0268, + "grad_norm": 0.492184042930603, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023280128, + "loss": 1.1323, + "grad_norm": 0.4511490762233734, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023327744, + "loss": 1.0772, + "grad_norm": 0.5964053869247437, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023326208, + "loss": 1.0934, + "grad_norm": 0.6043298244476318, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023327744, + "loss": 0.9037, + "grad_norm": 0.5789465308189392, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.963528192, + "gpu_mem": 5.02333696, + "loss": 1.1659, + "grad_norm": 0.470492959022522, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023312384, + "loss": 1.1343, + "grad_norm": 0.44621017575263977, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023297024, + "loss": 1.0141, + "grad_norm": 0.4191407859325409, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023326208, + "loss": 1.1004, + "grad_norm": 0.5875828862190247, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023340032, + "loss": 1.0639, + "grad_norm": 0.4438910484313965, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023295488, + "loss": 1.1101, + "grad_norm": 0.6536675095558167, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023301632, + "loss": 1.2045, + "grad_norm": 0.48392200469970703, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023330816, + "loss": 1.2, + "grad_norm": 0.5156332850456238, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.963528192, + "gpu_mem": 5.023326208, + "loss": 0.9604, + "grad_norm": 0.4589671492576599, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023312384, + "loss": 1.0888, + "grad_norm": 0.5387463569641113, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023326208, + "loss": 1.0185, + "grad_norm": 0.6027670502662659, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023315456, + "loss": 1.112, + "grad_norm": 0.4265848398208618, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.9637248, + "gpu_mem": 5.0233216, + "loss": 0.9895, + "grad_norm": 0.4802420437335968, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023326208, + "loss": 1.1728, + "grad_norm": 0.5389770865440369, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.9637248, + "gpu_mem": 5.0233216, + "loss": 0.9831, + "grad_norm": 0.5070933103561401, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023295488, + "loss": 0.9975, + "grad_norm": 0.431145578622818, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023304704, + "loss": 1.1101, + "grad_norm": 0.5050808787345886, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023323136, + "loss": 1.1111, + "grad_norm": 0.5073701739311218, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023293952, + "loss": 1.0211, + "grad_norm": 0.3915492594242096, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023324672, + "loss": 1.1537, + "grad_norm": 0.49069464206695557, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023333888, + "loss": 1.0338, + "grad_norm": 0.513830840587616, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.9637248, + "gpu_mem": 5.023295488, + "loss": 1.1425, + "grad_norm": 0.4547666311264038, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023300096, + "loss": 0.9498, + "grad_norm": 0.40007373690605164, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023324672, + "loss": 0.9338, + "grad_norm": 0.37017980217933655, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023343104, + "loss": 1.0399, + "grad_norm": 0.3665134012699127, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023324672, + "loss": 1.15, + "grad_norm": 0.610748827457428, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.963921408, + "gpu_mem": 5.02337536, + "loss": 1.0308, + "grad_norm": 0.5005159378051758, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023307776, + "loss": 1.1703, + "grad_norm": 0.5598139762878418, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023309312, + "loss": 1.0687, + "grad_norm": 0.6039681434631348, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023309312, + "loss": 1.1061, + "grad_norm": 0.5780653357505798, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023315456, + "loss": 1.0163, + "grad_norm": 0.48772644996643066, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.963921408, + "gpu_mem": 5.02332928, + "loss": 0.9676, + "grad_norm": 0.6020269393920898, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023333888, + "loss": 1.1824, + "grad_norm": 0.4887208342552185, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023327744, + "loss": 1.0323, + "grad_norm": 0.6141750812530518, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.963921408, + "gpu_mem": 5.0233216, + "loss": 1.1402, + "grad_norm": 0.6193527579307556, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023335424, + "loss": 1.0483, + "grad_norm": 0.5877699255943298, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023327744, + "loss": 1.0694, + "grad_norm": 0.483978271484375, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023312384, + "loss": 1.1411, + "grad_norm": 0.5609557628631592, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.963921408, + "gpu_mem": 5.0233216, + "loss": 1.0535, + "grad_norm": 0.3509097993373871, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023330816, + "loss": 1.0067, + "grad_norm": 0.5354563593864441, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023333888, + "loss": 1.2093, + "grad_norm": 0.5104676485061646, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023287808, + "loss": 1.0713, + "grad_norm": 0.4866039752960205, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023341568, + "loss": 0.8902, + "grad_norm": 0.5462644696235657, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023343104, + "loss": 0.8575, + "grad_norm": 0.44156742095947266, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023287808, + "loss": 1.1877, + "grad_norm": 0.5066599249839783, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.963921408, + "gpu_mem": 5.0233216, + "loss": 1.1348, + "grad_norm": 0.5533562302589417, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023300096, + "loss": 1.0291, + "grad_norm": 0.47313812375068665, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023330816, + "loss": 1.0718, + "grad_norm": 0.4158135652542114, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.963921408, + "gpu_mem": 5.02330624, + "loss": 1.0683, + "grad_norm": 0.6613315343856812, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023340032, + "loss": 0.9685, + "grad_norm": 0.6325422525405884, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.963921408, + "gpu_mem": 5.02336, + "loss": 1.0883, + "grad_norm": 0.5540536046028137, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023324672, + "loss": 1.1265, + "grad_norm": 0.7901585698127747, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.963921408, + "gpu_mem": 5.02334464, + "loss": 0.9586, + "grad_norm": 0.5022393465042114, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023324672, + "loss": 0.8811, + "grad_norm": 0.4811021387577057, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023326208, + "loss": 0.9333, + "grad_norm": 0.5494473576545715, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023318528, + "loss": 1.0506, + "grad_norm": 0.6232452392578125, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023316992, + "loss": 1.1489, + "grad_norm": 0.6469204425811768, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023330816, + "loss": 1.0327, + "grad_norm": 0.383417010307312, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023301632, + "loss": 0.9703, + "grad_norm": 0.4473437964916229, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.963921408, + "gpu_mem": 5.02335232, + "loss": 1.0999, + "grad_norm": 0.5709439516067505, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023349248, + "loss": 1.1725, + "grad_norm": 0.6588503122329712, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.963921408, + "gpu_mem": 5.02332928, + "loss": 1.0559, + "grad_norm": 0.4603111445903778, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023310848, + "loss": 1.0535, + "grad_norm": 0.518408477306366, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023320064, + "loss": 1.1327, + "grad_norm": 0.5250354409217834, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023287808, + "loss": 1.0972, + "grad_norm": 0.5418365597724915, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023349248, + "loss": 1.0724, + "grad_norm": 0.4864099621772766, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023347712, + "loss": 1.0054, + "grad_norm": 0.44942715764045715, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023303168, + "loss": 1.0829, + "grad_norm": 0.4113433063030243, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.963921408, + "gpu_mem": 5.023335424, + "loss": 0.9526, + "grad_norm": 0.5454650521278381, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.964118016, + "gpu_mem": 5.02332928, + "loss": 0.8891, + "grad_norm": 0.490966260433197, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023315456, + "loss": 1.0422, + "grad_norm": 0.7750758528709412, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023315456, + "loss": 1.0992, + "grad_norm": 0.5282580852508545, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023341568, + "loss": 0.9662, + "grad_norm": 0.4632655084133148, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.964118016, + "gpu_mem": 5.02332928, + "loss": 1.0919, + "grad_norm": 0.5678972005844116, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023320064, + "loss": 1.0873, + "grad_norm": 0.4645240604877472, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023312384, + "loss": 0.9596, + "grad_norm": 0.5567759871482849, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023335424, + "loss": 0.9447, + "grad_norm": 0.5124657154083252, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023324672, + "loss": 0.9723, + "grad_norm": 0.4699349105358124, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023309312, + "loss": 0.9566, + "grad_norm": 0.5540416836738586, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.964118016, + "gpu_mem": 5.02337536, + "loss": 1.0772, + "grad_norm": 0.5270485877990723, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023318528, + "loss": 1.1574, + "grad_norm": 0.5362735986709595, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023304704, + "loss": 1.0163, + "grad_norm": 0.4219372570514679, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023369216, + "loss": 1.068, + "grad_norm": 0.5902315974235535, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.964118016, + "gpu_mem": 5.02329856, + "loss": 1.0716, + "grad_norm": 0.514237105846405, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023326208, + "loss": 0.8974, + "grad_norm": 0.5701006650924683, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.964118016, + "gpu_mem": 5.02332928, + "loss": 0.958, + "grad_norm": 0.45110800862312317, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023304704, + "loss": 1.1057, + "grad_norm": 0.7585674524307251, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023332352, + "loss": 0.8619, + "grad_norm": 0.4372211992740631, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023341568, + "loss": 1.0466, + "grad_norm": 0.7389890551567078, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023338496, + "loss": 1.0586, + "grad_norm": 0.5455394983291626, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023310848, + "loss": 1.0954, + "grad_norm": 0.4956252872943878, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023057408, + "loss": 1.0516, + "grad_norm": 0.7133793830871582, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.964118016, + "gpu_mem": 5.023057408, + "train_runtime": 16106.0118, + "train_samples_per_second": 2.478, + "train_steps_per_second": 0.039, + "total_flos": 8.83449094045778e+16, + "train_loss": 1.3510505271454651 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5723daa9f5f7b854bf548bbee9a6d37e12198a3a --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 16, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 8, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f48c17aa3f7551219c29cf7a1247061a3d2f4b15 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.2884883489344752 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..e2ce7c776bbeee099220077df204a640eff6984b --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12615680 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-hellaswag-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2", + "seed": 42, + "timestamp": "2025-09-13T15:56:48.558626" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..c9dff65e598dc6e153267076a4faa799522cdf2b --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-hellaswag-r8-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.845194752, + "gpu_mem": 4.46822144, + "loss": 3.4877, + "grad_norm": 2.652858018875122, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.851486208, + "gpu_mem": 4.569140224, + "loss": 3.6203, + "grad_norm": 2.569589614868164, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.852665856, + "gpu_mem": 4.569147904, + "loss": 3.4298, + "grad_norm": 2.5694985389709473, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.853648896, + "gpu_mem": 4.569181696, + "loss": 3.6191, + "grad_norm": 2.5661938190460205, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.854631936, + "gpu_mem": 4.569144832, + "loss": 3.5275, + "grad_norm": 2.5922415256500244, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.855614976, + "gpu_mem": 4.569190912, + "loss": 3.6362, + "grad_norm": 2.705122232437134, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.856598016, + "gpu_mem": 4.569150976, + "loss": 3.6073, + "grad_norm": 2.494671106338501, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.857384448, + "gpu_mem": 4.569181696, + "loss": 3.361, + "grad_norm": 2.599144697189331, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.85817088, + "gpu_mem": 4.569181696, + "loss": 3.3063, + "grad_norm": 2.5035932064056396, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.858957312, + "gpu_mem": 4.569124864, + "loss": 3.1704, + "grad_norm": 2.5214931964874268, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.859743744, + "gpu_mem": 4.569144832, + "loss": 3.1784, + "grad_norm": 2.6764233112335205, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.860333568, + "gpu_mem": 4.56914176, + "loss": 3.4514, + "grad_norm": 2.7263848781585693, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.86112, + "gpu_mem": 4.56913408, + "loss": 3.2131, + "grad_norm": 2.5129001140594482, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.861906432, + "gpu_mem": 4.569160192, + "loss": 3.1123, + "grad_norm": 2.6172397136688232, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.862496256, + "gpu_mem": 4.569158656, + "loss": 2.923, + "grad_norm": 2.44769024848938, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.863282688, + "gpu_mem": 4.569150976, + "loss": 3.0318, + "grad_norm": 2.3047540187835693, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.86406912, + "gpu_mem": 4.569150976, + "loss": 2.7886, + "grad_norm": 2.163897752761841, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.864658944, + "gpu_mem": 4.569150976, + "loss": 2.79, + "grad_norm": 2.0896389484405518, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.865445376, + "gpu_mem": 4.569150976, + "loss": 2.7953, + "grad_norm": 1.981105923652649, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.8660352, + "gpu_mem": 4.569124864, + "loss": 2.6643, + "grad_norm": 2.062620162963867, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.866625024, + "gpu_mem": 4.56914176, + "loss": 2.6779, + "grad_norm": 1.8565504550933838, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.867411456, + "gpu_mem": 4.56914944, + "loss": 2.3192, + "grad_norm": 1.5183587074279785, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.86800128, + "gpu_mem": 4.569163264, + "loss": 2.4056, + "grad_norm": 1.7455663681030273, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.868591104, + "gpu_mem": 4.569147904, + "loss": 2.4678, + "grad_norm": 1.8693830966949463, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.869180928, + "gpu_mem": 4.569135616, + "loss": 2.0585, + "grad_norm": 1.262090802192688, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.869770752, + "gpu_mem": 4.56914176, + "loss": 1.8765, + "grad_norm": 1.0624538660049438, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.870360576, + "gpu_mem": 4.56914944, + "loss": 1.8914, + "grad_norm": 1.0873037576675415, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.8709504, + "gpu_mem": 4.569144832, + "loss": 1.8107, + "grad_norm": 0.870336651802063, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.871540224, + "gpu_mem": 4.569154048, + "loss": 1.7835, + "grad_norm": 1.0354729890823364, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.872130048, + "gpu_mem": 4.5691264, + "loss": 1.7714, + "grad_norm": 0.8415583372116089, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.872719872, + "gpu_mem": 4.569181696, + "loss": 1.6225, + "grad_norm": 0.6792495250701904, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.873309696, + "gpu_mem": 4.569174016, + "loss": 1.653, + "grad_norm": 0.8474215865135193, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.87389952, + "gpu_mem": 4.569127936, + "loss": 1.5302, + "grad_norm": 0.5080743432044983, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.874489344, + "gpu_mem": 4.569146368, + "loss": 1.4951, + "grad_norm": 0.449371337890625, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.875079168, + "gpu_mem": 4.569167872, + "loss": 1.4463, + "grad_norm": 0.3512953817844391, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.875668992, + "gpu_mem": 4.569166336, + "loss": 1.4215, + "grad_norm": 0.2538783848285675, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.876258816, + "gpu_mem": 4.569198592, + "loss": 1.4321, + "grad_norm": 0.2348223477602005, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.87684864, + "gpu_mem": 4.569150976, + "loss": 1.4403, + "grad_norm": 0.24639829993247986, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.877438464, + "gpu_mem": 4.569207808, + "loss": 1.4524, + "grad_norm": 0.5610830187797546, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.87783168, + "gpu_mem": 4.569135616, + "loss": 1.457, + "grad_norm": 0.2614257335662842, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.878421504, + "gpu_mem": 4.569163264, + "loss": 1.4706, + "grad_norm": 0.30777761340141296, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.879011328, + "gpu_mem": 4.569177088, + "loss": 1.4031, + "grad_norm": 0.27493953704833984, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.879601152, + "gpu_mem": 4.569183232, + "loss": 1.4109, + "grad_norm": 0.21519558131694794, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.879994368, + "gpu_mem": 4.569161728, + "loss": 1.402, + "grad_norm": 0.20435571670532227, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.880584192, + "gpu_mem": 4.569161728, + "loss": 1.4046, + "grad_norm": 0.18874406814575195, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.880977408, + "gpu_mem": 4.569161728, + "loss": 1.4111, + "grad_norm": 0.29865503311157227, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.881567232, + "gpu_mem": 4.569147904, + "loss": 1.3944, + "grad_norm": 0.14381153881549835, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.882157056, + "gpu_mem": 4.569166336, + "loss": 1.4169, + "grad_norm": 0.29505881667137146, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.88274688, + "gpu_mem": 4.569178624, + "loss": 1.393, + "grad_norm": 0.14160187542438507, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.883140096, + "gpu_mem": 4.569155584, + "loss": 1.3741, + "grad_norm": 0.1527193784713745, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.883533312, + "gpu_mem": 4.569140224, + "loss": 1.3949, + "grad_norm": 0.2776142358779907, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.884123136, + "gpu_mem": 4.569144832, + "loss": 1.3675, + "grad_norm": 0.16329404711723328, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.884516352, + "gpu_mem": 4.56917248, + "loss": 1.415, + "grad_norm": 0.3132256269454956, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.884909568, + "gpu_mem": 4.569147904, + "loss": 1.41, + "grad_norm": 0.38708823919296265, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.885499392, + "gpu_mem": 4.569166336, + "loss": 1.403, + "grad_norm": 0.2274034172296524, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.886089216, + "gpu_mem": 4.569160192, + "loss": 1.4124, + "grad_norm": 0.3139621317386627, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.886482432, + "gpu_mem": 4.5691264, + "loss": 1.395, + "grad_norm": 0.23100294172763824, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.886875648, + "gpu_mem": 4.569155584, + "loss": 1.4002, + "grad_norm": 0.1820942610502243, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.887465472, + "gpu_mem": 4.569138688, + "loss": 1.3715, + "grad_norm": 0.2588822543621063, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.888055296, + "gpu_mem": 4.56918016, + "loss": 1.4223, + "grad_norm": 0.39941850304603577, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.888448512, + "gpu_mem": 4.569146368, + "loss": 1.3869, + "grad_norm": 0.14979925751686096, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.888841728, + "gpu_mem": 4.569186304, + "loss": 1.3704, + "grad_norm": 0.34556126594543457, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.889234944, + "gpu_mem": 4.569140224, + "loss": 1.4505, + "grad_norm": 0.19694042205810547, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.889824768, + "gpu_mem": 4.569144832, + "loss": 1.4529, + "grad_norm": 0.27252641320228577, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.890217984, + "gpu_mem": 4.56914176, + "loss": 1.4559, + "grad_norm": 0.28496047854423523, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.8906112, + "gpu_mem": 4.569160192, + "loss": 1.4538, + "grad_norm": 0.19004642963409424, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.891201024, + "gpu_mem": 4.569152512, + "loss": 1.4056, + "grad_norm": 0.1516844928264618, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.891790848, + "gpu_mem": 4.569137152, + "loss": 1.3811, + "grad_norm": 0.14968623220920563, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.892184064, + "gpu_mem": 4.569207808, + "loss": 1.4345, + "grad_norm": 0.21478185057640076, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.892773888, + "gpu_mem": 4.569158656, + "loss": 1.4399, + "grad_norm": 0.2844275236129761, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.893167104, + "gpu_mem": 4.569183232, + "loss": 1.4554, + "grad_norm": 0.32969391345977783, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.89356032, + "gpu_mem": 4.569154048, + "loss": 1.3629, + "grad_norm": 0.09676369279623032, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.893953536, + "gpu_mem": 4.569146368, + "loss": 1.3908, + "grad_norm": 0.2293398231267929, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.89454336, + "gpu_mem": 4.569140224, + "loss": 1.389, + "grad_norm": 0.13706089556217194, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.894936576, + "gpu_mem": 4.569169408, + "loss": 1.4062, + "grad_norm": 0.15513446927070618, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.895133184, + "gpu_mem": 4.569160192, + "loss": 1.4178, + "grad_norm": 0.27336132526397705, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.8955264, + "gpu_mem": 4.569147904, + "loss": 1.4372, + "grad_norm": 0.22154369950294495, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.895919616, + "gpu_mem": 4.569140224, + "loss": 1.3528, + "grad_norm": 0.13841509819030762, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.89650944, + "gpu_mem": 4.569192448, + "loss": 1.3994, + "grad_norm": 0.17541813850402832, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.896902656, + "gpu_mem": 4.569170944, + "loss": 1.416, + "grad_norm": 0.14436671137809753, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.897295872, + "gpu_mem": 4.5691648, + "loss": 1.3795, + "grad_norm": 0.1713201254606247, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.897689088, + "gpu_mem": 4.56914176, + "loss": 1.3876, + "grad_norm": 0.1324116438627243, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.898082304, + "gpu_mem": 4.569163264, + "loss": 1.4125, + "grad_norm": 0.2507175803184509, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.898672128, + "gpu_mem": 4.569135616, + "loss": 1.423, + "grad_norm": 0.19219312071800232, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.898868736, + "gpu_mem": 4.569143296, + "loss": 1.396, + "grad_norm": 0.1266571283340454, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.899261952, + "gpu_mem": 4.569161728, + "loss": 1.378, + "grad_norm": 0.10452074557542801, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.899851776, + "gpu_mem": 4.569150976, + "loss": 1.402, + "grad_norm": 0.156269833445549, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.900048384, + "gpu_mem": 4.56914944, + "loss": 1.402, + "grad_norm": 0.20419596135616302, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.9004416, + "gpu_mem": 4.569144832, + "loss": 1.38, + "grad_norm": 0.14830653369426727, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.900834816, + "gpu_mem": 4.56914944, + "loss": 1.3982, + "grad_norm": 0.16685380041599274, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.901228032, + "gpu_mem": 4.569160192, + "loss": 1.3722, + "grad_norm": 0.08698570728302002, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.901621248, + "gpu_mem": 4.569163264, + "loss": 1.4024, + "grad_norm": 0.13656233251094818, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.902014464, + "gpu_mem": 4.569163264, + "loss": 1.4206, + "grad_norm": 0.1606617122888565, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.90240768, + "gpu_mem": 4.569158656, + "loss": 1.4156, + "grad_norm": 0.1821456104516983, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.902800896, + "gpu_mem": 4.569177088, + "loss": 1.3807, + "grad_norm": 0.15714570879936218, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.903194112, + "gpu_mem": 4.56918016, + "loss": 1.3971, + "grad_norm": 0.09955421835184097, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.903587328, + "gpu_mem": 4.56915712, + "loss": 1.3973, + "grad_norm": 0.10488687455654144, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.903980544, + "gpu_mem": 4.569167872, + "loss": 1.3964, + "grad_norm": 0.08006884902715683, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.90437376, + "gpu_mem": 4.569167872, + "loss": 1.3918, + "grad_norm": 0.19516435265541077, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.904766976, + "gpu_mem": 4.569143296, + "loss": 1.3958, + "grad_norm": 0.12893664836883545, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.905160192, + "gpu_mem": 4.56917248, + "loss": 1.4, + "grad_norm": 0.18006132543087006, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.9053568, + "gpu_mem": 4.56914944, + "loss": 1.3703, + "grad_norm": 0.1705038845539093, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.905750016, + "gpu_mem": 4.569166336, + "loss": 1.3961, + "grad_norm": 0.14437200129032135, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.905946624, + "gpu_mem": 4.56913408, + "loss": 1.3856, + "grad_norm": 0.15177342295646667, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.90633984, + "gpu_mem": 4.56914944, + "loss": 1.3979, + "grad_norm": 0.13017188012599945, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.906536448, + "gpu_mem": 4.569129472, + "loss": 1.3972, + "grad_norm": 0.1511073112487793, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.906929664, + "gpu_mem": 4.569170944, + "loss": 1.3737, + "grad_norm": 0.07639097422361374, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.90732288, + "gpu_mem": 4.569166336, + "loss": 1.3886, + "grad_norm": 0.1020292416214943, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.907716096, + "gpu_mem": 4.56917248, + "loss": 1.3804, + "grad_norm": 0.1501128375530243, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.908109312, + "gpu_mem": 4.569169408, + "loss": 1.3825, + "grad_norm": 0.10923729836940765, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.90830592, + "gpu_mem": 4.569170944, + "loss": 1.374, + "grad_norm": 0.1207469254732132, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.908699136, + "gpu_mem": 4.569167872, + "loss": 1.392, + "grad_norm": 0.12664152681827545, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.909092352, + "gpu_mem": 4.569147904, + "loss": 1.402, + "grad_norm": 0.10610536485910416, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.909485568, + "gpu_mem": 4.569143296, + "loss": 1.3964, + "grad_norm": 0.18948015570640564, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.909682176, + "gpu_mem": 4.569161728, + "loss": 1.3967, + "grad_norm": 0.07808728516101837, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.910075392, + "gpu_mem": 4.56917248, + "loss": 1.3927, + "grad_norm": 0.11011093109846115, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.910272, + "gpu_mem": 4.569158656, + "loss": 1.3874, + "grad_norm": 0.08363348245620728, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.910665216, + "gpu_mem": 4.569174016, + "loss": 1.4059, + "grad_norm": 0.19525128602981567, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.911058432, + "gpu_mem": 4.569155584, + "loss": 1.3652, + "grad_norm": 0.1660914570093155, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.911451648, + "gpu_mem": 4.569181696, + "loss": 1.4025, + "grad_norm": 0.1700599044561386, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.911648256, + "gpu_mem": 4.569140224, + "loss": 1.3974, + "grad_norm": 0.13998501002788544, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.912041472, + "gpu_mem": 4.56917248, + "loss": 1.3846, + "grad_norm": 0.11206551641225815, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.912434688, + "gpu_mem": 4.569166336, + "loss": 1.3882, + "grad_norm": 0.09623062610626221, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.912631296, + "gpu_mem": 4.569167872, + "loss": 1.3824, + "grad_norm": 0.128830224275589, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.913024512, + "gpu_mem": 4.569143296, + "loss": 1.3577, + "grad_norm": 0.11152134090662003, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.91322112, + "gpu_mem": 4.569152512, + "loss": 1.355, + "grad_norm": 0.11871733516454697, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.913614336, + "gpu_mem": 4.569138688, + "loss": 1.4241, + "grad_norm": 0.16917580366134644, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.914007552, + "gpu_mem": 4.569175552, + "loss": 1.3671, + "grad_norm": 0.10125470161437988, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.914400768, + "gpu_mem": 4.56917248, + "loss": 1.3935, + "grad_norm": 0.08817076683044434, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.914597376, + "gpu_mem": 4.56917248, + "loss": 1.4182, + "grad_norm": 0.13928735256195068, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.914990592, + "gpu_mem": 4.569161728, + "loss": 1.4068, + "grad_norm": 0.14608316123485565, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.9151872, + "gpu_mem": 4.569161728, + "loss": 1.3526, + "grad_norm": 0.12650689482688904, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.915580416, + "gpu_mem": 4.569143296, + "loss": 1.3851, + "grad_norm": 0.11890316009521484, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.915973632, + "gpu_mem": 4.569154048, + "loss": 1.4149, + "grad_norm": 0.1657579392194748, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.91617024, + "gpu_mem": 4.569163264, + "loss": 1.3896, + "grad_norm": 0.14170406758785248, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.916563456, + "gpu_mem": 4.569178624, + "loss": 1.4003, + "grad_norm": 0.15257301926612854, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.916760064, + "gpu_mem": 4.5691264, + "loss": 1.3774, + "grad_norm": 0.08356064558029175, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.91715328, + "gpu_mem": 4.569146368, + "loss": 1.4032, + "grad_norm": 0.1284860074520111, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.917349888, + "gpu_mem": 4.569127936, + "loss": 1.3901, + "grad_norm": 0.15495565533638, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.917743104, + "gpu_mem": 4.569144832, + "loss": 1.3833, + "grad_norm": 0.13998712599277496, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.917939712, + "gpu_mem": 4.569150976, + "loss": 1.3816, + "grad_norm": 0.11999405920505524, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.918332928, + "gpu_mem": 4.569147904, + "loss": 1.382, + "grad_norm": 0.0808078870177269, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.918529536, + "gpu_mem": 4.569174016, + "loss": 1.3848, + "grad_norm": 0.09504427015781403, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.918726144, + "gpu_mem": 4.569147904, + "loss": 1.3899, + "grad_norm": 0.08437266200780869, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.91911936, + "gpu_mem": 4.56918784, + "loss": 1.3925, + "grad_norm": 0.1684160977602005, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.919315968, + "gpu_mem": 4.569137152, + "loss": 1.3814, + "grad_norm": 0.08926627039909363, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.919512576, + "gpu_mem": 4.569146368, + "loss": 1.4166, + "grad_norm": 0.1395501345396042, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.919905792, + "gpu_mem": 4.569166336, + "loss": 1.381, + "grad_norm": 0.12583552300930023, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.9201024, + "gpu_mem": 4.56915712, + "loss": 1.3824, + "grad_norm": 0.10115263611078262, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.920495616, + "gpu_mem": 4.569169408, + "loss": 1.3738, + "grad_norm": 0.11516670882701874, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.920692224, + "gpu_mem": 4.56913408, + "loss": 1.3839, + "grad_norm": 0.15094754099845886, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.920888832, + "gpu_mem": 4.5691648, + "loss": 1.3909, + "grad_norm": 0.15433722734451294, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.921282048, + "gpu_mem": 4.569160192, + "loss": 1.3733, + "grad_norm": 0.13195116817951202, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.921478656, + "gpu_mem": 4.569184768, + "loss": 1.3872, + "grad_norm": 0.09619386494159698, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.921675264, + "gpu_mem": 4.569121792, + "loss": 1.3929, + "grad_norm": 0.10976003855466843, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.921871872, + "gpu_mem": 4.569175552, + "loss": 1.3703, + "grad_norm": 0.10335914045572281, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.92206848, + "gpu_mem": 4.569167872, + "loss": 1.3914, + "grad_norm": 0.10780985653400421, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.922461696, + "gpu_mem": 4.569183232, + "loss": 1.3966, + "grad_norm": 0.09943587332963943, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.922854912, + "gpu_mem": 4.569184768, + "loss": 1.3888, + "grad_norm": 0.0958065316081047, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.92305152, + "gpu_mem": 4.569152512, + "loss": 1.4285, + "grad_norm": 0.16363629698753357, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.923248128, + "gpu_mem": 4.569144832, + "loss": 1.4036, + "grad_norm": 0.16475588083267212, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.923641344, + "gpu_mem": 4.56918016, + "loss": 1.3981, + "grad_norm": 0.10214866697788239, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.923837952, + "gpu_mem": 4.569127936, + "loss": 1.4047, + "grad_norm": 0.1731015145778656, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.92403456, + "gpu_mem": 4.569163264, + "loss": 1.3795, + "grad_norm": 0.1303660124540329, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.924231168, + "gpu_mem": 4.569152512, + "loss": 1.3761, + "grad_norm": 0.07194478064775467, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.924427776, + "gpu_mem": 4.569184768, + "loss": 1.3905, + "grad_norm": 0.12756556272506714, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.924820992, + "gpu_mem": 4.569178624, + "loss": 1.3736, + "grad_norm": 0.161830872297287, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.9250176, + "gpu_mem": 4.56915712, + "loss": 1.3806, + "grad_norm": 0.07469745725393295, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.925410816, + "gpu_mem": 4.569154048, + "loss": 1.3781, + "grad_norm": 0.12786777317523956, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.925607424, + "gpu_mem": 4.569190912, + "loss": 1.4048, + "grad_norm": 0.09335099905729294, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.927376896, + "gpu_mem": 4.5691648, + "loss": 1.3705, + "grad_norm": 0.12744824588298798, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.927573504, + "gpu_mem": 4.56914176, + "loss": 1.3893, + "grad_norm": 0.12286381423473358, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.927770112, + "gpu_mem": 4.569167872, + "loss": 1.4104, + "grad_norm": 0.15195141732692719, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.92796672, + "gpu_mem": 4.56918016, + "loss": 1.4121, + "grad_norm": 0.14796558022499084, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.928163328, + "gpu_mem": 4.56914176, + "loss": 1.3964, + "grad_norm": 0.1289375275373459, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.928359936, + "gpu_mem": 4.569150976, + "loss": 1.4007, + "grad_norm": 0.10092747956514359, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.928556544, + "gpu_mem": 4.569132544, + "loss": 1.3839, + "grad_norm": 0.11587506532669067, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.928753152, + "gpu_mem": 4.569146368, + "loss": 1.3933, + "grad_norm": 0.17092053592205048, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.92894976, + "gpu_mem": 4.569154048, + "loss": 1.3914, + "grad_norm": 0.10316187888383865, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.929146368, + "gpu_mem": 4.569135616, + "loss": 1.4044, + "grad_norm": 0.15141098201274872, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.929342976, + "gpu_mem": 4.569166336, + "loss": 1.389, + "grad_norm": 0.11251378059387207, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.929539584, + "gpu_mem": 4.569137152, + "loss": 1.3971, + "grad_norm": 0.17825596034526825, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.9299328, + "gpu_mem": 4.569161728, + "loss": 1.3774, + "grad_norm": 0.0844452753663063, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.930129408, + "gpu_mem": 4.56914176, + "loss": 1.3903, + "grad_norm": 0.1294938623905182, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.930326016, + "gpu_mem": 4.569174016, + "loss": 1.4099, + "grad_norm": 0.1917700469493866, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.930522624, + "gpu_mem": 4.5691648, + "loss": 1.386, + "grad_norm": 0.11671796441078186, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.930719232, + "gpu_mem": 4.569160192, + "loss": 1.4121, + "grad_norm": 0.20046761631965637, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.93091584, + "gpu_mem": 4.569117184, + "loss": 1.3687, + "grad_norm": 0.08852381259202957, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.931309056, + "gpu_mem": 4.569197056, + "loss": 1.3756, + "grad_norm": 0.05853727459907532, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.931505664, + "gpu_mem": 4.569147904, + "loss": 1.3787, + "grad_norm": 0.140967458486557, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.931702272, + "gpu_mem": 4.569147904, + "loss": 1.3685, + "grad_norm": 0.09987839311361313, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.932095488, + "gpu_mem": 4.569114112, + "loss": 1.4316, + "grad_norm": 0.22619743645191193, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.932292096, + "gpu_mem": 4.569154048, + "loss": 1.3758, + "grad_norm": 0.1543143093585968, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.932488704, + "gpu_mem": 4.56914944, + "loss": 1.3668, + "grad_norm": 0.1452859342098236, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.932685312, + "gpu_mem": 4.569137152, + "loss": 1.3984, + "grad_norm": 0.1145983561873436, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.93288192, + "gpu_mem": 4.569161728, + "loss": 1.4159, + "grad_norm": 0.253730833530426, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.933078528, + "gpu_mem": 4.569177088, + "loss": 1.3983, + "grad_norm": 0.19392472505569458, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.933275136, + "gpu_mem": 4.56914944, + "loss": 1.3832, + "grad_norm": 0.11371741443872452, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.933471744, + "gpu_mem": 4.569146368, + "loss": 1.3942, + "grad_norm": 0.12761908769607544, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.933668352, + "gpu_mem": 4.569161728, + "loss": 1.4234, + "grad_norm": 0.15882179141044617, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.93386496, + "gpu_mem": 4.569138688, + "loss": 1.3984, + "grad_norm": 0.08784448355436325, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.934061568, + "gpu_mem": 4.569140224, + "loss": 1.3946, + "grad_norm": 0.0962081030011177, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.934061568, + "gpu_mem": 4.569181696, + "loss": 1.3971, + "grad_norm": 0.1639123409986496, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.934258176, + "gpu_mem": 4.569152512, + "loss": 1.3947, + "grad_norm": 0.10268871486186981, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.934454784, + "gpu_mem": 4.569152512, + "loss": 1.3808, + "grad_norm": 0.11725164204835892, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.934651392, + "gpu_mem": 4.56914944, + "loss": 1.4009, + "grad_norm": 0.17192034423351288, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.934848, + "gpu_mem": 4.56914944, + "loss": 1.3959, + "grad_norm": 0.09936308115720749, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.935044608, + "gpu_mem": 4.569140224, + "loss": 1.3879, + "grad_norm": 0.08508932590484619, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.935241216, + "gpu_mem": 4.569175552, + "loss": 1.3736, + "grad_norm": 0.11456809937953949, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.935437824, + "gpu_mem": 4.569132544, + "loss": 1.3773, + "grad_norm": 0.09572583436965942, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.935634432, + "gpu_mem": 4.569160192, + "loss": 1.3852, + "grad_norm": 0.0914580374956131, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.93583104, + "gpu_mem": 4.569169408, + "loss": 1.395, + "grad_norm": 0.23109567165374756, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.936027648, + "gpu_mem": 4.56914176, + "loss": 1.3764, + "grad_norm": 0.17007100582122803, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.936224256, + "gpu_mem": 4.569150976, + "loss": 1.3907, + "grad_norm": 0.11849001795053482, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.937403904, + "gpu_mem": 4.569152512, + "loss": 1.3881, + "grad_norm": 0.17202003300189972, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.93779712, + "gpu_mem": 4.569152512, + "loss": 1.3958, + "grad_norm": 0.1403442919254303, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.937993728, + "gpu_mem": 4.569137152, + "loss": 1.3945, + "grad_norm": 0.09395784884691238, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.938190336, + "gpu_mem": 4.569158656, + "loss": 1.3839, + "grad_norm": 0.1229684129357338, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.938386944, + "gpu_mem": 4.569192448, + "loss": 1.3827, + "grad_norm": 0.1263919472694397, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.938583552, + "gpu_mem": 4.569146368, + "loss": 1.395, + "grad_norm": 0.13413845002651215, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.93878016, + "gpu_mem": 4.569152512, + "loss": 1.3944, + "grad_norm": 0.11542266607284546, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.938976768, + "gpu_mem": 4.569167872, + "loss": 1.3942, + "grad_norm": 0.11249082535505295, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.938976768, + "gpu_mem": 4.569186304, + "loss": 1.3811, + "grad_norm": 0.09335729479789734, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.939173376, + "gpu_mem": 4.569155584, + "loss": 1.384, + "grad_norm": 0.05563786253333092, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.939369984, + "gpu_mem": 4.56914176, + "loss": 1.377, + "grad_norm": 0.08558055013418198, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.939566592, + "gpu_mem": 4.56913408, + "loss": 1.3879, + "grad_norm": 0.05541250482201576, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.9397632, + "gpu_mem": 4.569198592, + "loss": 1.38, + "grad_norm": 0.08435282111167908, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.939959808, + "gpu_mem": 4.569137152, + "loss": 1.3977, + "grad_norm": 0.19042043387889862, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.940156416, + "gpu_mem": 4.569189376, + "loss": 1.3786, + "grad_norm": 0.054216593503952026, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.940353024, + "gpu_mem": 4.569170944, + "loss": 1.3767, + "grad_norm": 0.07976459711790085, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.940549632, + "gpu_mem": 4.569169408, + "loss": 1.3778, + "grad_norm": 0.12045295536518097, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.94074624, + "gpu_mem": 4.569174016, + "loss": 1.3799, + "grad_norm": 0.06797172874212265, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.940942848, + "gpu_mem": 4.56914944, + "loss": 1.3876, + "grad_norm": 0.23848429322242737, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.941139456, + "gpu_mem": 4.569178624, + "loss": 1.4015, + "grad_norm": 0.11648571491241455, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.941336064, + "gpu_mem": 4.569155584, + "loss": 1.3842, + "grad_norm": 0.1455588936805725, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.941532672, + "gpu_mem": 4.569217024, + "loss": 1.3837, + "grad_norm": 0.16792842745780945, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.941532672, + "gpu_mem": 4.56914176, + "loss": 1.4075, + "grad_norm": 0.18150931596755981, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.94172928, + "gpu_mem": 4.569152512, + "loss": 1.4047, + "grad_norm": 0.15213412046432495, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.941925888, + "gpu_mem": 4.569150976, + "loss": 1.4059, + "grad_norm": 0.08434338867664337, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.942122496, + "gpu_mem": 4.569147904, + "loss": 1.3722, + "grad_norm": 0.0873139426112175, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.942319104, + "gpu_mem": 4.569178624, + "loss": 1.3805, + "grad_norm": 0.1251145750284195, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.942515712, + "gpu_mem": 4.56915712, + "loss": 1.3809, + "grad_norm": 0.11215078085660934, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.94271232, + "gpu_mem": 4.569152512, + "loss": 1.3813, + "grad_norm": 0.11048711091279984, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.942908928, + "gpu_mem": 4.569163264, + "loss": 1.3839, + "grad_norm": 0.1583673655986786, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.943105536, + "gpu_mem": 4.569167872, + "loss": 1.3982, + "grad_norm": 0.08232268691062927, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.943105536, + "gpu_mem": 4.569129472, + "loss": 1.3888, + "grad_norm": 0.07884624600410461, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.943302144, + "gpu_mem": 4.569197056, + "loss": 1.3699, + "grad_norm": 0.07859917730093002, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.943498752, + "gpu_mem": 4.569160192, + "loss": 1.3709, + "grad_norm": 0.11804059892892838, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.94369536, + "gpu_mem": 4.56914944, + "loss": 1.4, + "grad_norm": 0.13607589900493622, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.94369536, + "gpu_mem": 4.569166336, + "loss": 1.3806, + "grad_norm": 0.13464748859405518, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.943891968, + "gpu_mem": 4.569140224, + "loss": 1.4013, + "grad_norm": 0.1877242624759674, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.943891968, + "gpu_mem": 4.56918784, + "loss": 1.4002, + "grad_norm": 0.11500083655118942, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.944088576, + "gpu_mem": 4.569155584, + "loss": 1.3774, + "grad_norm": 0.12008476257324219, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.944285184, + "gpu_mem": 4.569144832, + "loss": 1.371, + "grad_norm": 0.11749988794326782, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.944481792, + "gpu_mem": 4.569160192, + "loss": 1.3791, + "grad_norm": 0.08551786839962006, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.9446784, + "gpu_mem": 4.56915712, + "loss": 1.3694, + "grad_norm": 0.08188202232122421, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.944875008, + "gpu_mem": 4.56915712, + "loss": 1.4018, + "grad_norm": 0.1974252164363861, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.945071616, + "gpu_mem": 4.569144832, + "loss": 1.3813, + "grad_norm": 0.10544128715991974, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.945268224, + "gpu_mem": 4.569127936, + "loss": 1.3928, + "grad_norm": 0.14718236029148102, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.945268224, + "gpu_mem": 4.569190912, + "loss": 1.3987, + "grad_norm": 0.13644054532051086, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.945464832, + "gpu_mem": 4.569144832, + "loss": 1.3997, + "grad_norm": 0.12411530315876007, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.94566144, + "gpu_mem": 4.569154048, + "loss": 1.3844, + "grad_norm": 0.1677798628807068, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.945858048, + "gpu_mem": 4.569189376, + "loss": 1.3855, + "grad_norm": 0.08090117573738098, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.945858048, + "gpu_mem": 4.569154048, + "loss": 1.3809, + "grad_norm": 0.0836615040898323, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.946054656, + "gpu_mem": 4.569158656, + "loss": 1.3913, + "grad_norm": 0.11616785824298859, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.946251264, + "gpu_mem": 4.569206272, + "loss": 1.3804, + "grad_norm": 0.08309703320264816, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.946251264, + "gpu_mem": 4.569215488, + "loss": 1.3927, + "grad_norm": 0.10747293382883072, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.946447872, + "gpu_mem": 4.569169408, + "loss": 1.3927, + "grad_norm": 0.14724832773208618, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.94664448, + "gpu_mem": 4.569163264, + "loss": 1.4261, + "grad_norm": 0.2329377830028534, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.946841088, + "gpu_mem": 4.569224704, + "loss": 1.3938, + "grad_norm": 0.15457101166248322, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.946841088, + "gpu_mem": 4.569150976, + "loss": 1.3835, + "grad_norm": 0.09995131939649582, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.947037696, + "gpu_mem": 4.56914944, + "loss": 1.3818, + "grad_norm": 0.07764334976673126, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.947234304, + "gpu_mem": 4.569152512, + "loss": 1.3817, + "grad_norm": 0.09559372067451477, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.947234304, + "gpu_mem": 4.569138688, + "loss": 1.4027, + "grad_norm": 0.14215907454490662, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.947234304, + "gpu_mem": 4.569154048, + "loss": 1.3979, + "grad_norm": 0.11523955315351486, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.947430912, + "gpu_mem": 4.569192448, + "loss": 1.383, + "grad_norm": 0.0859469622373581, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.94762752, + "gpu_mem": 4.56917248, + "loss": 1.3737, + "grad_norm": 0.08982454985380173, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.947824128, + "gpu_mem": 4.569198592, + "loss": 1.3938, + "grad_norm": 0.12573374807834625, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.948020736, + "gpu_mem": 4.56914944, + "loss": 1.369, + "grad_norm": 0.09659311175346375, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.948020736, + "gpu_mem": 4.569143296, + "loss": 1.399, + "grad_norm": 0.13779152929782867, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.948217344, + "gpu_mem": 4.569166336, + "loss": 1.3713, + "grad_norm": 0.10317379981279373, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.948217344, + "gpu_mem": 4.569144832, + "loss": 1.362, + "grad_norm": 0.0936991423368454, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.948413952, + "gpu_mem": 4.569158656, + "loss": 1.3826, + "grad_norm": 0.10224805027246475, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.94861056, + "gpu_mem": 4.569163264, + "loss": 1.3695, + "grad_norm": 0.15013185143470764, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.94861056, + "gpu_mem": 4.569181696, + "loss": 1.4054, + "grad_norm": 0.16456399857997894, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.948807168, + "gpu_mem": 4.569152512, + "loss": 1.3567, + "grad_norm": 0.11232182383537292, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.949003776, + "gpu_mem": 4.56918016, + "loss": 1.4032, + "grad_norm": 0.07850407809019089, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.949003776, + "gpu_mem": 4.569161728, + "loss": 1.3804, + "grad_norm": 0.1547841876745224, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.949200384, + "gpu_mem": 4.56914944, + "loss": 1.3984, + "grad_norm": 0.14424015581607819, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.949200384, + "gpu_mem": 4.569158656, + "loss": 1.3754, + "grad_norm": 0.10868766903877258, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.949396992, + "gpu_mem": 4.569155584, + "loss": 1.3874, + "grad_norm": 0.12306096404790878, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.949396992, + "gpu_mem": 4.569170944, + "loss": 1.4039, + "grad_norm": 0.14712324738502502, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.9495936, + "gpu_mem": 4.569178624, + "loss": 1.3786, + "grad_norm": 0.08959011733531952, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.9495936, + "gpu_mem": 4.569167872, + "loss": 1.4078, + "grad_norm": 0.11256465315818787, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.949790208, + "gpu_mem": 4.569152512, + "loss": 1.3848, + "grad_norm": 0.12587976455688477, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.949986816, + "gpu_mem": 4.569155584, + "loss": 1.3744, + "grad_norm": 0.08181307464838028, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.949986816, + "gpu_mem": 4.56914944, + "loss": 1.3798, + "grad_norm": 0.10923133790493011, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.950183424, + "gpu_mem": 4.569144832, + "loss": 1.395, + "grad_norm": 0.17862950265407562, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.950380032, + "gpu_mem": 4.569166336, + "loss": 1.3872, + "grad_norm": 0.0906771570444107, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.950380032, + "gpu_mem": 4.569158656, + "loss": 1.3613, + "grad_norm": 0.12308082729578018, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.950380032, + "gpu_mem": 4.569131008, + "loss": 1.3717, + "grad_norm": 0.06348498165607452, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.95057664, + "gpu_mem": 4.569129472, + "loss": 1.3846, + "grad_norm": 0.08248110860586166, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.950773248, + "gpu_mem": 4.569155584, + "loss": 1.4062, + "grad_norm": 0.14066562056541443, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.950773248, + "gpu_mem": 4.569138688, + "loss": 1.3804, + "grad_norm": 0.09256920963525772, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.950773248, + "gpu_mem": 4.569169408, + "loss": 1.3496, + "grad_norm": 0.25364428758621216, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.950969856, + "gpu_mem": 4.569152512, + "loss": 1.4041, + "grad_norm": 0.13906043767929077, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.951166464, + "gpu_mem": 4.569183232, + "loss": 1.3945, + "grad_norm": 0.1494535654783249, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.951166464, + "gpu_mem": 4.569150976, + "loss": 1.3949, + "grad_norm": 0.09512457251548767, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.951363072, + "gpu_mem": 4.569177088, + "loss": 1.3666, + "grad_norm": 0.0951942726969719, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.951363072, + "gpu_mem": 4.569152512, + "loss": 1.3846, + "grad_norm": 0.05956839397549629, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.95155968, + "gpu_mem": 4.569147904, + "loss": 1.387, + "grad_norm": 0.07758490741252899, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.951756288, + "gpu_mem": 4.569150976, + "loss": 1.3825, + "grad_norm": 0.1281156986951828, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.951756288, + "gpu_mem": 4.569169408, + "loss": 1.3963, + "grad_norm": 0.11398234218358994, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.951756288, + "gpu_mem": 4.56914944, + "loss": 1.3686, + "grad_norm": 0.13046856224536896, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.951952896, + "gpu_mem": 4.569154048, + "loss": 1.3848, + "grad_norm": 0.07770109176635742, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.952149504, + "gpu_mem": 4.56914944, + "loss": 1.4071, + "grad_norm": 0.12624643743038177, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.952149504, + "gpu_mem": 4.56915712, + "loss": 1.389, + "grad_norm": 0.06815124303102493, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.952149504, + "gpu_mem": 4.569181696, + "loss": 1.3761, + "grad_norm": 0.08491852134466171, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.952346112, + "gpu_mem": 4.569174016, + "loss": 1.3845, + "grad_norm": 0.10456173121929169, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.95254272, + "gpu_mem": 4.569175552, + "loss": 1.3871, + "grad_norm": 0.10268915444612503, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.95254272, + "gpu_mem": 4.569150976, + "loss": 1.3846, + "grad_norm": 0.09995705634355545, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.95254272, + "gpu_mem": 4.569152512, + "loss": 1.3867, + "grad_norm": 0.08077079802751541, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.952739328, + "gpu_mem": 4.56917248, + "loss": 1.3852, + "grad_norm": 0.08230598270893097, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.952935936, + "gpu_mem": 4.569144832, + "loss": 1.3893, + "grad_norm": 0.14326904714107513, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.952935936, + "gpu_mem": 4.56915712, + "loss": 1.3603, + "grad_norm": 0.1029609739780426, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.953132544, + "gpu_mem": 4.569166336, + "loss": 1.3978, + "grad_norm": 0.14329442381858826, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.953132544, + "gpu_mem": 4.569143296, + "loss": 1.3674, + "grad_norm": 0.17070156335830688, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.953329152, + "gpu_mem": 4.569167872, + "loss": 1.3925, + "grad_norm": 0.13663966953754425, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.953329152, + "gpu_mem": 4.569167872, + "loss": 1.3824, + "grad_norm": 0.08660689741373062, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.95352576, + "gpu_mem": 4.569150976, + "loss": 1.3755, + "grad_norm": 0.10117126256227493, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.95352576, + "gpu_mem": 4.569147904, + "loss": 1.3813, + "grad_norm": 0.22153809666633606, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.95352576, + "gpu_mem": 4.569140224, + "loss": 1.3828, + "grad_norm": 0.09859436750411987, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.95352576, + "gpu_mem": 4.569178624, + "loss": 1.4117, + "grad_norm": 0.14838232100009918, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.95352576, + "gpu_mem": 4.569155584, + "loss": 1.387, + "grad_norm": 0.08850575983524323, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.953722368, + "gpu_mem": 4.569154048, + "loss": 1.3909, + "grad_norm": 0.07854010909795761, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.953722368, + "gpu_mem": 4.569170944, + "loss": 1.3907, + "grad_norm": 0.10881248116493225, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.953918976, + "gpu_mem": 4.569155584, + "loss": 1.3647, + "grad_norm": 0.07028242945671082, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.953918976, + "gpu_mem": 4.569167872, + "loss": 1.4028, + "grad_norm": 0.1382720172405243, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.953918976, + "gpu_mem": 4.56918016, + "loss": 1.3887, + "grad_norm": 0.1248403936624527, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.954115584, + "gpu_mem": 4.569155584, + "loss": 1.3972, + "grad_norm": 0.1116277277469635, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.954312192, + "gpu_mem": 4.569200128, + "loss": 1.4084, + "grad_norm": 0.17364366352558136, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.954312192, + "gpu_mem": 4.569174016, + "loss": 1.3646, + "grad_norm": 0.07603185623884201, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.954312192, + "gpu_mem": 4.569170944, + "loss": 1.3928, + "grad_norm": 0.10405686497688293, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.954312192, + "gpu_mem": 4.569152512, + "loss": 1.3736, + "grad_norm": 0.06193113699555397, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.9545088, + "gpu_mem": 4.569158656, + "loss": 1.4003, + "grad_norm": 0.13078409433364868, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.9545088, + "gpu_mem": 4.569127936, + "loss": 1.3855, + "grad_norm": 0.0992245301604271, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.954705408, + "gpu_mem": 4.569192448, + "loss": 1.3881, + "grad_norm": 0.101511649787426, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.954705408, + "gpu_mem": 4.569146368, + "loss": 1.4102, + "grad_norm": 0.13083696365356445, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.954705408, + "gpu_mem": 4.569140224, + "loss": 1.3872, + "grad_norm": 0.08346622437238693, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.954902016, + "gpu_mem": 4.56919552, + "loss": 1.379, + "grad_norm": 0.052734918892383575, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.954902016, + "gpu_mem": 4.569161728, + "loss": 1.4027, + "grad_norm": 0.143284872174263, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.955098624, + "gpu_mem": 4.56914944, + "loss": 1.3722, + "grad_norm": 0.1271652728319168, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.955098624, + "gpu_mem": 4.569154048, + "loss": 1.3838, + "grad_norm": 0.08342558145523071, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.955295232, + "gpu_mem": 4.56913408, + "loss": 1.3807, + "grad_norm": 0.07938611507415771, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.955295232, + "gpu_mem": 4.569158656, + "loss": 1.3714, + "grad_norm": 0.0720868855714798, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.95549184, + "gpu_mem": 4.569137152, + "loss": 1.3856, + "grad_norm": 0.0667443722486496, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.95549184, + "gpu_mem": 4.569154048, + "loss": 1.3798, + "grad_norm": 0.07146485894918442, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.95549184, + "gpu_mem": 4.56911872, + "loss": 1.388, + "grad_norm": 0.06780857592821121, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.95549184, + "gpu_mem": 4.569150976, + "loss": 1.3803, + "grad_norm": 0.09353932738304138, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.955688448, + "gpu_mem": 4.569140224, + "loss": 1.3845, + "grad_norm": 0.06944283097982407, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.955688448, + "gpu_mem": 4.569177088, + "loss": 1.392, + "grad_norm": 0.11544737219810486, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.955688448, + "gpu_mem": 4.569143296, + "loss": 1.3721, + "grad_norm": 0.08759613335132599, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.955688448, + "gpu_mem": 4.569166336, + "loss": 1.3806, + "grad_norm": 0.12256285548210144, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.955885056, + "gpu_mem": 4.569155584, + "loss": 1.3805, + "grad_norm": 0.09091100096702576, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.956081664, + "gpu_mem": 4.569161728, + "loss": 1.374, + "grad_norm": 0.07430815696716309, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.956081664, + "gpu_mem": 4.569155584, + "loss": 1.3989, + "grad_norm": 0.1702882945537567, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.956081664, + "gpu_mem": 4.569174016, + "loss": 1.3881, + "grad_norm": 0.1057715117931366, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.956081664, + "gpu_mem": 4.56913408, + "loss": 1.3879, + "grad_norm": 0.053218476474285126, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.956081664, + "gpu_mem": 4.569166336, + "loss": 1.3681, + "grad_norm": 0.06490162760019302, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.956278272, + "gpu_mem": 4.569186304, + "loss": 1.3908, + "grad_norm": 0.08681511878967285, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.95647488, + "gpu_mem": 4.56918016, + "loss": 1.3907, + "grad_norm": 0.10255490243434906, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.95647488, + "gpu_mem": 4.569143296, + "loss": 1.4003, + "grad_norm": 0.13598914444446564, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.95647488, + "gpu_mem": 4.569160192, + "loss": 1.4021, + "grad_norm": 0.14073297381401062, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.95647488, + "gpu_mem": 4.569137152, + "loss": 1.3593, + "grad_norm": 0.1357545405626297, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.95647488, + "gpu_mem": 4.569169408, + "loss": 1.3689, + "grad_norm": 0.20364461839199066, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.95647488, + "gpu_mem": 4.5691648, + "loss": 1.3728, + "grad_norm": 0.08505731076002121, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.956671488, + "gpu_mem": 4.569174016, + "loss": 1.3827, + "grad_norm": 0.0947883352637291, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.956671488, + "gpu_mem": 4.569147904, + "loss": 1.3906, + "grad_norm": 0.08319840580224991, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.956671488, + "gpu_mem": 4.569167872, + "loss": 1.3724, + "grad_norm": 0.17546437680721283, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.956671488, + "gpu_mem": 4.56914176, + "loss": 1.381, + "grad_norm": 0.08807572722434998, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.956868096, + "gpu_mem": 4.569166336, + "loss": 1.4013, + "grad_norm": 0.11155908554792404, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.956868096, + "gpu_mem": 4.569150976, + "loss": 1.3768, + "grad_norm": 0.16212379932403564, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.956868096, + "gpu_mem": 4.569184768, + "loss": 1.3758, + "grad_norm": 0.16032327711582184, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.957064704, + "gpu_mem": 4.5691648, + "loss": 1.3842, + "grad_norm": 0.09929832816123962, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.957261312, + "gpu_mem": 4.56914944, + "loss": 1.3802, + "grad_norm": 0.09439581632614136, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.957261312, + "gpu_mem": 4.569184768, + "loss": 1.3739, + "grad_norm": 0.07851336896419525, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.957261312, + "gpu_mem": 4.569190912, + "loss": 1.3862, + "grad_norm": 0.12528516352176666, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.957261312, + "gpu_mem": 4.569154048, + "loss": 1.3953, + "grad_norm": 0.12453577667474747, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.957261312, + "gpu_mem": 4.569132544, + "loss": 1.3851, + "grad_norm": 0.17714989185333252, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.95745792, + "gpu_mem": 4.569184768, + "loss": 1.3889, + "grad_norm": 0.16647300124168396, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.95745792, + "gpu_mem": 4.569170944, + "loss": 1.3791, + "grad_norm": 0.11350125074386597, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.95745792, + "gpu_mem": 4.5691648, + "loss": 1.3869, + "grad_norm": 0.1488281637430191, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.95745792, + "gpu_mem": 4.569170944, + "loss": 1.408, + "grad_norm": 0.16477453708648682, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.95745792, + "gpu_mem": 4.569147904, + "loss": 1.3812, + "grad_norm": 0.13375796377658844, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.95745792, + "gpu_mem": 4.569161728, + "loss": 1.361, + "grad_norm": 0.12730145454406738, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.95745792, + "gpu_mem": 4.569161728, + "loss": 1.3821, + "grad_norm": 0.12114223092794418, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.957654528, + "gpu_mem": 4.569131008, + "loss": 1.3717, + "grad_norm": 0.24267542362213135, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.957654528, + "gpu_mem": 4.5691648, + "loss": 1.3808, + "grad_norm": 0.13982945680618286, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.957851136, + "gpu_mem": 4.569143296, + "loss": 1.3755, + "grad_norm": 0.08249422907829285, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.957851136, + "gpu_mem": 4.569150976, + "loss": 1.3812, + "grad_norm": 0.07850506901741028, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.957851136, + "gpu_mem": 4.569169408, + "loss": 1.3754, + "grad_norm": 0.05807549133896828, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.958047744, + "gpu_mem": 4.569137152, + "loss": 1.3895, + "grad_norm": 0.09704960882663727, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.958047744, + "gpu_mem": 4.56914176, + "loss": 1.3867, + "grad_norm": 0.07102946937084198, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.958047744, + "gpu_mem": 4.569137152, + "loss": 1.3814, + "grad_norm": 0.11083949357271194, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.958047744, + "gpu_mem": 4.569181696, + "loss": 1.3834, + "grad_norm": 0.08707565069198608, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.958047744, + "gpu_mem": 4.5691648, + "loss": 1.396, + "grad_norm": 0.183969184756279, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.958047744, + "gpu_mem": 4.569154048, + "loss": 1.3744, + "grad_norm": 0.07683484256267548, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.958244352, + "gpu_mem": 4.569175552, + "loss": 1.3987, + "grad_norm": 0.20644834637641907, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.958244352, + "gpu_mem": 4.56914176, + "loss": 1.3989, + "grad_norm": 0.10844528675079346, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.95844096, + "gpu_mem": 4.56915712, + "loss": 1.3475, + "grad_norm": 0.13026072084903717, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.95844096, + "gpu_mem": 4.56915712, + "loss": 1.3866, + "grad_norm": 0.11253026872873306, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.95844096, + "gpu_mem": 4.569147904, + "loss": 1.3832, + "grad_norm": 0.12082858383655548, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.95844096, + "gpu_mem": 4.569158656, + "loss": 1.4226, + "grad_norm": 0.27194857597351074, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.958637568, + "gpu_mem": 4.569183232, + "loss": 1.3734, + "grad_norm": 0.10392075031995773, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.958637568, + "gpu_mem": 4.569135616, + "loss": 1.3918, + "grad_norm": 0.1604335904121399, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.958637568, + "gpu_mem": 4.569170944, + "loss": 1.3864, + "grad_norm": 0.1377021223306656, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.958834176, + "gpu_mem": 4.569132544, + "loss": 1.3528, + "grad_norm": 0.15297776460647583, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.958834176, + "gpu_mem": 4.569150976, + "loss": 1.3964, + "grad_norm": 0.16597171127796173, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.958834176, + "gpu_mem": 4.569143296, + "loss": 1.3786, + "grad_norm": 0.09608740359544754, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.958834176, + "gpu_mem": 4.56918016, + "loss": 1.3862, + "grad_norm": 0.1125231385231018, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.959030784, + "gpu_mem": 4.569140224, + "loss": 1.3792, + "grad_norm": 0.10351355373859406, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.959030784, + "gpu_mem": 4.569154048, + "loss": 1.3945, + "grad_norm": 0.19650213420391083, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.959030784, + "gpu_mem": 4.569158656, + "loss": 1.3874, + "grad_norm": 0.14407646656036377, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.959030784, + "gpu_mem": 4.569120256, + "loss": 1.3723, + "grad_norm": 0.17791759967803955, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.959030784, + "gpu_mem": 4.569143296, + "loss": 1.3969, + "grad_norm": 0.10026402771472931, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.959030784, + "gpu_mem": 4.56914176, + "loss": 1.3621, + "grad_norm": 0.09971196204423904, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.959030784, + "gpu_mem": 4.569160192, + "loss": 1.4051, + "grad_norm": 0.14844252169132233, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.959227392, + "gpu_mem": 4.56915712, + "loss": 1.3979, + "grad_norm": 0.08524385839700699, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.959227392, + "gpu_mem": 4.569155584, + "loss": 1.3844, + "grad_norm": 0.0682341456413269, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.959424, + "gpu_mem": 4.569174016, + "loss": 1.3711, + "grad_norm": 0.1187443882226944, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.959424, + "gpu_mem": 4.569135616, + "loss": 1.3845, + "grad_norm": 0.08752217888832092, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.959424, + "gpu_mem": 4.56918016, + "loss": 1.3781, + "grad_norm": 0.09890313446521759, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.959424, + "gpu_mem": 4.569144832, + "loss": 1.3844, + "grad_norm": 0.1402551382780075, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.959424, + "gpu_mem": 4.56917248, + "loss": 1.3754, + "grad_norm": 0.12130479514598846, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.959424, + "gpu_mem": 4.569152512, + "loss": 1.3903, + "grad_norm": 0.10303548723459244, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.959424, + "gpu_mem": 4.569198592, + "loss": 1.3966, + "grad_norm": 0.2057006061077118, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.959620608, + "gpu_mem": 4.569163264, + "loss": 1.3945, + "grad_norm": 0.13439930975437164, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.959620608, + "gpu_mem": 4.569154048, + "loss": 1.3882, + "grad_norm": 0.11835765838623047, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.959620608, + "gpu_mem": 4.569147904, + "loss": 1.4023, + "grad_norm": 0.107454814016819, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.959620608, + "gpu_mem": 4.569132544, + "loss": 1.3797, + "grad_norm": 0.10486225038766861, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.959620608, + "gpu_mem": 4.569150976, + "loss": 1.398, + "grad_norm": 0.19299599528312683, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.959620608, + "gpu_mem": 4.569152512, + "loss": 1.3831, + "grad_norm": 0.1713951975107193, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.959620608, + "gpu_mem": 4.56915712, + "loss": 1.3989, + "grad_norm": 0.13624431192874908, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.959620608, + "gpu_mem": 4.569160192, + "loss": 1.3895, + "grad_norm": 0.08052192628383636, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.959817216, + "gpu_mem": 4.569154048, + "loss": 1.385, + "grad_norm": 0.11876460909843445, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.959817216, + "gpu_mem": 4.56918016, + "loss": 1.3798, + "grad_norm": 0.09041713923215866, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.959817216, + "gpu_mem": 4.569147904, + "loss": 1.3793, + "grad_norm": 0.1543312966823578, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569175552, + "loss": 1.3881, + "grad_norm": 0.1315462440252304, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569183232, + "loss": 1.3681, + "grad_norm": 0.15974967181682587, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.960013824, + "gpu_mem": 4.5691648, + "loss": 1.4079, + "grad_norm": 0.15909689664840698, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569150976, + "loss": 1.4017, + "grad_norm": 0.1320515275001526, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569161728, + "loss": 1.3889, + "grad_norm": 0.1017974466085434, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569154048, + "loss": 1.3668, + "grad_norm": 0.10052844882011414, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569170944, + "loss": 1.3732, + "grad_norm": 0.08052248507738113, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569143296, + "loss": 1.3703, + "grad_norm": 0.1469639241695404, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569174016, + "loss": 1.3982, + "grad_norm": 0.10704222321510315, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569155584, + "loss": 1.3703, + "grad_norm": 0.13294459879398346, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.960013824, + "gpu_mem": 4.569143296, + "loss": 1.3948, + "grad_norm": 0.11210019141435623, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.960210432, + "gpu_mem": 4.569155584, + "loss": 1.372, + "grad_norm": 0.12349337339401245, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.960210432, + "gpu_mem": 4.569161728, + "loss": 1.3792, + "grad_norm": 0.08982829004526138, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.960210432, + "gpu_mem": 4.56914944, + "loss": 1.3881, + "grad_norm": 0.11219499260187149, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.960210432, + "gpu_mem": 4.569138688, + "loss": 1.3798, + "grad_norm": 0.1221214234828949, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.960210432, + "gpu_mem": 4.569140224, + "loss": 1.385, + "grad_norm": 0.15491320192813873, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.960210432, + "gpu_mem": 4.569154048, + "loss": 1.3782, + "grad_norm": 0.07401300966739655, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.96040704, + "gpu_mem": 4.56915712, + "loss": 1.3723, + "grad_norm": 0.07736221700906754, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.96040704, + "gpu_mem": 4.569167872, + "loss": 1.3774, + "grad_norm": 0.1161329373717308, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.96040704, + "gpu_mem": 4.56914176, + "loss": 1.3732, + "grad_norm": 0.10256291925907135, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.96040704, + "gpu_mem": 4.56915712, + "loss": 1.3681, + "grad_norm": 0.08353433758020401, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.96040704, + "gpu_mem": 4.569166336, + "loss": 1.3659, + "grad_norm": 0.11406828463077545, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.96040704, + "gpu_mem": 4.569140224, + "loss": 1.3827, + "grad_norm": 0.12741383910179138, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.96040704, + "gpu_mem": 4.569146368, + "loss": 1.3884, + "grad_norm": 0.12806014716625214, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.96040704, + "gpu_mem": 4.569135616, + "loss": 1.3765, + "grad_norm": 0.183338925242424, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.96040704, + "gpu_mem": 4.56914176, + "loss": 1.3787, + "grad_norm": 0.11088994145393372, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569178624, + "loss": 1.4011, + "grad_norm": 0.10986243933439255, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.960603648, + "gpu_mem": 4.5691264, + "loss": 1.3873, + "grad_norm": 0.11504164338111877, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569146368, + "loss": 1.3927, + "grad_norm": 0.1607152372598648, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569146368, + "loss": 1.4092, + "grad_norm": 0.17742939293384552, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569144832, + "loss": 1.4106, + "grad_norm": 0.1331031173467636, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569143296, + "loss": 1.3518, + "grad_norm": 0.13255657255649567, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569135616, + "loss": 1.3838, + "grad_norm": 0.1781572848558426, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.960603648, + "gpu_mem": 4.56919552, + "loss": 1.3807, + "grad_norm": 0.09947041422128677, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569140224, + "loss": 1.3648, + "grad_norm": 0.09922625869512558, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.960603648, + "gpu_mem": 4.569123328, + "loss": 1.3713, + "grad_norm": 0.1432727575302124, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569154048, + "loss": 1.3837, + "grad_norm": 0.1682610660791397, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569198592, + "loss": 1.3763, + "grad_norm": 0.16376347839832306, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569178624, + "loss": 1.4008, + "grad_norm": 0.14201273024082184, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569178624, + "loss": 1.3946, + "grad_norm": 0.12247451394796371, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569144832, + "loss": 1.4025, + "grad_norm": 0.1915765106678009, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569169408, + "loss": 1.3675, + "grad_norm": 0.08160535991191864, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.960800256, + "gpu_mem": 4.56917248, + "loss": 1.3786, + "grad_norm": 0.10405699908733368, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569150976, + "loss": 1.3782, + "grad_norm": 0.157465860247612, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569174016, + "loss": 1.3783, + "grad_norm": 0.20200689136981964, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569154048, + "loss": 1.3701, + "grad_norm": 0.14676855504512787, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569178624, + "loss": 1.3771, + "grad_norm": 0.10739074647426605, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.960800256, + "gpu_mem": 4.569161728, + "loss": 1.3762, + "grad_norm": 0.09707512706518173, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.960800256, + "gpu_mem": 4.56915712, + "loss": 1.3781, + "grad_norm": 0.20561768114566803, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.960996864, + "gpu_mem": 4.569169408, + "loss": 1.3774, + "grad_norm": 0.11660339683294296, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.960996864, + "gpu_mem": 4.569140224, + "loss": 1.3755, + "grad_norm": 0.13415414094924927, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.960996864, + "gpu_mem": 4.569154048, + "loss": 1.3785, + "grad_norm": 0.15125708281993866, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569140224, + "loss": 1.3661, + "grad_norm": 0.1302867829799652, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.961193472, + "gpu_mem": 4.56913408, + "loss": 1.3917, + "grad_norm": 0.080440454185009, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569140224, + "loss": 1.3832, + "grad_norm": 0.08249234408140182, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569154048, + "loss": 1.3902, + "grad_norm": 0.15014439821243286, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569137152, + "loss": 1.3696, + "grad_norm": 0.3668355941772461, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569190912, + "loss": 1.3698, + "grad_norm": 0.13635320961475372, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.961193472, + "gpu_mem": 4.56913408, + "loss": 1.3702, + "grad_norm": 0.14207756519317627, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569212416, + "loss": 1.3865, + "grad_norm": 0.16973446309566498, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569155584, + "loss": 1.372, + "grad_norm": 0.2787920832633972, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569174016, + "loss": 1.38, + "grad_norm": 0.2493170201778412, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.961193472, + "gpu_mem": 4.56914944, + "loss": 1.3845, + "grad_norm": 0.22436852753162384, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569181696, + "loss": 1.3758, + "grad_norm": 0.2282821536064148, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569201664, + "loss": 1.3685, + "grad_norm": 0.16475269198417664, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569131008, + "loss": 1.3795, + "grad_norm": 0.1725904643535614, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569144832, + "loss": 1.3582, + "grad_norm": 0.30766022205352783, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.961193472, + "gpu_mem": 4.569129472, + "loss": 1.3825, + "grad_norm": 0.34755128622055054, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.96139008, + "gpu_mem": 4.569167872, + "loss": 1.3761, + "grad_norm": 0.13856643438339233, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.96139008, + "gpu_mem": 4.569167872, + "loss": 1.3994, + "grad_norm": 0.19383861124515533, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.96139008, + "gpu_mem": 4.569154048, + "loss": 1.3671, + "grad_norm": 0.2092168778181076, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.96139008, + "gpu_mem": 4.569144832, + "loss": 1.3568, + "grad_norm": 0.1319851279258728, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.96139008, + "gpu_mem": 4.56914944, + "loss": 1.3732, + "grad_norm": 0.17948079109191895, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.96139008, + "gpu_mem": 4.569152512, + "loss": 1.4014, + "grad_norm": 0.2592010796070099, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.96139008, + "gpu_mem": 4.569158656, + "loss": 1.3662, + "grad_norm": 0.30415722727775574, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569177088, + "loss": 1.3707, + "grad_norm": 0.11225222051143646, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569170944, + "loss": 1.3943, + "grad_norm": 0.18453404307365417, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569147904, + "loss": 1.3795, + "grad_norm": 0.16773836314678192, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569135616, + "loss": 1.3695, + "grad_norm": 0.15111969411373138, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569101824, + "loss": 1.3665, + "grad_norm": 0.21759431064128876, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.961586688, + "gpu_mem": 4.56914944, + "loss": 1.3745, + "grad_norm": 0.2828618586063385, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569115648, + "loss": 1.3785, + "grad_norm": 0.1290382444858551, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569163264, + "loss": 1.3602, + "grad_norm": 0.16809441149234772, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569161728, + "loss": 1.388, + "grad_norm": 0.16591447591781616, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569163264, + "loss": 1.383, + "grad_norm": 0.2656993269920349, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.961586688, + "gpu_mem": 4.56917248, + "loss": 1.3824, + "grad_norm": 0.1918942630290985, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569147904, + "loss": 1.3807, + "grad_norm": 0.11682610958814621, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569132544, + "loss": 1.3767, + "grad_norm": 0.15440618991851807, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569161728, + "loss": 1.3812, + "grad_norm": 0.20276911556720734, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569175552, + "loss": 1.3733, + "grad_norm": 0.1529669314622879, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.961586688, + "gpu_mem": 4.569131008, + "loss": 1.3817, + "grad_norm": 0.18623152375221252, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569137152, + "loss": 1.3743, + "grad_norm": 0.17978331446647644, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569166336, + "loss": 1.382, + "grad_norm": 0.1286793053150177, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569161728, + "loss": 1.3718, + "grad_norm": 0.15323203802108765, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569147904, + "loss": 1.3791, + "grad_norm": 0.16321462392807007, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569161728, + "loss": 1.3813, + "grad_norm": 0.17773151397705078, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569150976, + "loss": 1.3653, + "grad_norm": 0.13639989495277405, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.961783296, + "gpu_mem": 4.56915712, + "loss": 1.3582, + "grad_norm": 0.15526163578033447, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569161728, + "loss": 1.3893, + "grad_norm": 0.21147063374519348, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.961783296, + "gpu_mem": 4.56915712, + "loss": 1.3813, + "grad_norm": 0.10892124474048615, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569131008, + "loss": 1.3866, + "grad_norm": 0.15443618595600128, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569140224, + "loss": 1.3732, + "grad_norm": 0.17457713186740875, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569158656, + "loss": 1.3677, + "grad_norm": 0.18112346529960632, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569129472, + "loss": 1.3911, + "grad_norm": 0.2676812410354614, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569160192, + "loss": 1.373, + "grad_norm": 0.18968218564987183, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569169408, + "loss": 1.3773, + "grad_norm": 0.15592533349990845, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569131008, + "loss": 1.3698, + "grad_norm": 0.1227807030081749, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569135616, + "loss": 1.3672, + "grad_norm": 0.14391037821769714, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.961783296, + "gpu_mem": 4.569160192, + "loss": 1.3667, + "grad_norm": 0.15328063070774078, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569178624, + "loss": 1.3719, + "grad_norm": 0.11721517145633698, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569160192, + "loss": 1.4012, + "grad_norm": 0.40171417593955994, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.961979904, + "gpu_mem": 4.56921088, + "loss": 1.3674, + "grad_norm": 0.15043997764587402, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569143296, + "loss": 1.3737, + "grad_norm": 0.1909467726945877, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569144832, + "loss": 1.3726, + "grad_norm": 0.1596338450908661, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569144832, + "loss": 1.3924, + "grad_norm": 0.1806458979845047, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569150976, + "loss": 1.3707, + "grad_norm": 0.2202754020690918, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.961979904, + "gpu_mem": 4.5691648, + "loss": 1.3651, + "grad_norm": 0.13174790143966675, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569169408, + "loss": 1.387, + "grad_norm": 0.15048378705978394, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569163264, + "loss": 1.3866, + "grad_norm": 0.19274763762950897, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.961979904, + "gpu_mem": 4.56915712, + "loss": 1.4033, + "grad_norm": 0.20190924406051636, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569170944, + "loss": 1.3788, + "grad_norm": 0.18314000964164734, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569163264, + "loss": 1.3749, + "grad_norm": 0.13606035709381104, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569147904, + "loss": 1.4026, + "grad_norm": 0.23454615473747253, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.961979904, + "gpu_mem": 4.56915712, + "loss": 1.3643, + "grad_norm": 0.08178477734327316, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569166336, + "loss": 1.3701, + "grad_norm": 0.12373453378677368, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569169408, + "loss": 1.389, + "grad_norm": 0.16227751970291138, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569123328, + "loss": 1.3813, + "grad_norm": 0.1200789138674736, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569177088, + "loss": 1.3759, + "grad_norm": 0.18958142399787903, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.961979904, + "gpu_mem": 4.569178624, + "loss": 1.3712, + "grad_norm": 0.11867231875658035, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569123328, + "loss": 1.3874, + "grad_norm": 0.1175699532032013, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.962176512, + "gpu_mem": 4.56915712, + "loss": 1.3753, + "grad_norm": 0.15708571672439575, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569135616, + "loss": 1.3509, + "grad_norm": 0.12248227745294571, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569166336, + "loss": 1.3806, + "grad_norm": 0.10919484496116638, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.962176512, + "gpu_mem": 4.56914176, + "loss": 1.3765, + "grad_norm": 0.32887178659439087, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569175552, + "loss": 1.3702, + "grad_norm": 0.16640739142894745, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.962176512, + "gpu_mem": 4.56919552, + "loss": 1.3872, + "grad_norm": 0.14840783178806305, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569160192, + "loss": 1.3797, + "grad_norm": 0.13726213574409485, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.962176512, + "gpu_mem": 4.56918016, + "loss": 1.37, + "grad_norm": 0.1519099324941635, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569160192, + "loss": 1.3694, + "grad_norm": 0.19757229089736938, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569161728, + "loss": 1.3668, + "grad_norm": 0.1673838198184967, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569154048, + "loss": 1.3767, + "grad_norm": 0.14191272854804993, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569152512, + "loss": 1.36, + "grad_norm": 0.40535256266593933, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569166336, + "loss": 1.3888, + "grad_norm": 0.136625275015831, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569137152, + "loss": 1.3881, + "grad_norm": 0.12656542658805847, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.962176512, + "gpu_mem": 4.56918784, + "loss": 1.3758, + "grad_norm": 0.1789671927690506, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569184768, + "loss": 1.3771, + "grad_norm": 0.12493494898080826, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.962176512, + "gpu_mem": 4.5691648, + "loss": 1.3732, + "grad_norm": 0.1324484944343567, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569146368, + "loss": 1.3921, + "grad_norm": 0.13219936192035675, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569155584, + "loss": 1.3913, + "grad_norm": 0.13235777616500854, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569123328, + "loss": 1.3661, + "grad_norm": 0.108641117811203, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569184768, + "loss": 1.37, + "grad_norm": 0.14280681312084198, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569183232, + "loss": 1.3959, + "grad_norm": 0.18592743575572968, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569138688, + "loss": 1.372, + "grad_norm": 0.12982177734375, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569170944, + "loss": 1.3595, + "grad_norm": 0.12453503161668777, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.962176512, + "gpu_mem": 4.5691648, + "loss": 1.3634, + "grad_norm": 0.17802751064300537, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569150976, + "loss": 1.3879, + "grad_norm": 0.30377915501594543, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569150976, + "loss": 1.371, + "grad_norm": 0.15095767378807068, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569177088, + "loss": 1.3777, + "grad_norm": 0.21598152816295624, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.962176512, + "gpu_mem": 4.5691648, + "loss": 1.3902, + "grad_norm": 0.16450022161006927, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569155584, + "loss": 1.3678, + "grad_norm": 0.13941466808319092, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569147904, + "loss": 1.3736, + "grad_norm": 0.12483320385217667, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569170944, + "loss": 1.3742, + "grad_norm": 0.17391516268253326, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569160192, + "loss": 1.3607, + "grad_norm": 0.16625471413135529, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569144832, + "loss": 1.3917, + "grad_norm": 0.163979172706604, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.962176512, + "gpu_mem": 4.56921088, + "loss": 1.3736, + "grad_norm": 0.17395494878292084, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569154048, + "loss": 1.3847, + "grad_norm": 0.222160205245018, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569140224, + "loss": 1.3805, + "grad_norm": 0.14523842930793762, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569204736, + "loss": 1.3937, + "grad_norm": 0.23285768926143646, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.962176512, + "gpu_mem": 4.56913408, + "loss": 1.3739, + "grad_norm": 0.18447525799274445, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569161728, + "loss": 1.3606, + "grad_norm": 0.1378791779279709, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.962176512, + "gpu_mem": 4.5691648, + "loss": 1.3751, + "grad_norm": 0.1372581571340561, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569140224, + "loss": 1.3817, + "grad_norm": 0.17237572371959686, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569167872, + "loss": 1.3704, + "grad_norm": 0.20639659464359283, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569177088, + "loss": 1.3802, + "grad_norm": 0.16992993652820587, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569174016, + "loss": 1.3971, + "grad_norm": 0.2093990296125412, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.962176512, + "gpu_mem": 4.569146368, + "loss": 1.3844, + "grad_norm": 0.15191924571990967, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.962176512, + "gpu_mem": 4.568892928, + "loss": 1.3923, + "grad_norm": 0.22232866287231445, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.962176512, + "gpu_mem": 4.568892928, + "train_runtime": 16065.466, + "train_samples_per_second": 2.484, + "train_steps_per_second": 0.039, + "total_flos": 8.526318586299187e+16, + "train_loss": 1.4588101987655346 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4758d99093e963e7b960b3e04b3ff68f0cc5fe --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 4, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 2, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0bec5b167bb0d2f58b456ade1394a90e1a877ba8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.29064242925015493 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..d412a02ddf8d60b33f880fb8192f0f6cdb4e7362 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3153920 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-logiqa-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2", + "seed": 42, + "timestamp": "2025-09-12T19:41:02.345606" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..b93f73f643bed7c5af7bae07c7e15602bb86fcb6 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r2-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.92272384, + "gpu_mem": 4.430417408, + "loss": 3.8396, + "grad_norm": 3.608147144317627, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.928032256, + "gpu_mem": 4.455668224, + "loss": 3.9728, + "grad_norm": 3.623274087905884, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.928032256, + "gpu_mem": 4.455745024, + "loss": 3.8512, + "grad_norm": 3.5405385494232178, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455643648, + "loss": 3.8223, + "grad_norm": 3.873591423034668, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455659008, + "loss": 3.929, + "grad_norm": 3.8158721923828125, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455651328, + "loss": 3.8757, + "grad_norm": 3.56449556350708, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455721984, + "loss": 3.9011, + "grad_norm": 3.7332425117492676, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455706624, + "loss": 3.7711, + "grad_norm": 3.5942087173461914, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455705088, + "loss": 3.7356, + "grad_norm": 3.505679130554199, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45571584, + "loss": 4.0423, + "grad_norm": 3.488323450088501, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455619072, + "loss": 3.7626, + "grad_norm": 3.4816484451293945, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45566976, + "loss": 3.6119, + "grad_norm": 3.4165711402893066, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45576192, + "loss": 3.6515, + "grad_norm": 3.5131239891052246, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 3.6974, + "grad_norm": 3.4749362468719482, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455812608, + "loss": 3.5963, + "grad_norm": 3.5153825283050537, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455672832, + "loss": 3.361, + "grad_norm": 3.0985071659088135, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455703552, + "loss": 3.2874, + "grad_norm": 3.4357998371124268, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455666688, + "loss": 3.4616, + "grad_norm": 3.279432773590088, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455574528, + "loss": 3.3634, + "grad_norm": 3.260382652282715, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455612928, + "loss": 3.2951, + "grad_norm": 3.1233274936676025, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45574656, + "loss": 2.9146, + "grad_norm": 3.045985221862793, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455645184, + "loss": 3.0178, + "grad_norm": 2.8022148609161377, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455672832, + "loss": 3.1671, + "grad_norm": 2.9901397228240967, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455666688, + "loss": 2.8538, + "grad_norm": 2.673530101776123, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455672832, + "loss": 2.7144, + "grad_norm": 2.4594390392303467, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455725056, + "loss": 2.4995, + "grad_norm": 2.6887078285217285, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455666688, + "loss": 2.6579, + "grad_norm": 2.2986369132995605, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455612928, + "loss": 2.4583, + "grad_norm": 2.1246449947357178, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455705088, + "loss": 2.333, + "grad_norm": 1.8623199462890625, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45570048, + "loss": 2.3321, + "grad_norm": 2.0744597911834717, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455678976, + "loss": 2.3855, + "grad_norm": 1.8913559913635254, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455683584, + "loss": 2.2464, + "grad_norm": 1.7667324542999268, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455718912, + "loss": 2.0756, + "grad_norm": 1.5967001914978027, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455643648, + "loss": 2.0645, + "grad_norm": 1.369579792022705, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455691264, + "loss": 1.9014, + "grad_norm": 1.181432843208313, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455706624, + "loss": 1.8392, + "grad_norm": 1.0632789134979248, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455772672, + "loss": 1.8435, + "grad_norm": 1.157536268234253, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455655936, + "loss": 1.6862, + "grad_norm": 0.859493613243103, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455772672, + "loss": 1.5753, + "grad_norm": 0.5821015238761902, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455694336, + "loss": 1.658, + "grad_norm": 0.7333043813705444, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45559296, + "loss": 1.5925, + "grad_norm": 0.5253768563270569, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455665152, + "loss": 1.5354, + "grad_norm": 0.2779075503349304, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455628288, + "loss": 1.501, + "grad_norm": 0.3173498511314392, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455663616, + "loss": 1.5002, + "grad_norm": 0.2558562755584717, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455717376, + "loss": 1.484, + "grad_norm": 0.2226683646440506, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455764992, + "loss": 1.4815, + "grad_norm": 0.27069878578186035, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45559296, + "loss": 1.4732, + "grad_norm": 0.22950895130634308, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45564672, + "loss": 1.467, + "grad_norm": 0.16220666468143463, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455635968, + "loss": 1.4526, + "grad_norm": 0.2833695113658905, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455645184, + "loss": 1.4322, + "grad_norm": 0.1461314558982849, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455740416, + "loss": 1.4144, + "grad_norm": 0.19095374643802643, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455682048, + "loss": 1.4875, + "grad_norm": 0.36749789118766785, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455760384, + "loss": 1.4834, + "grad_norm": 0.2373327910900116, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.44, + "grad_norm": 0.15824952721595764, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455657472, + "loss": 1.4096, + "grad_norm": 0.25300133228302, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455702016, + "loss": 1.4445, + "grad_norm": 0.24228444695472717, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455694336, + "loss": 1.451, + "grad_norm": 0.2678522765636444, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455717376, + "loss": 1.4125, + "grad_norm": 0.13799171149730682, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45568512, + "loss": 1.4157, + "grad_norm": 0.2175796627998352, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455675904, + "loss": 1.4473, + "grad_norm": 0.16826723515987396, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455712768, + "loss": 1.4151, + "grad_norm": 0.16675494611263275, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455752704, + "loss": 1.4108, + "grad_norm": 0.23427800834178925, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455682048, + "loss": 1.4058, + "grad_norm": 0.21787889301776886, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455583744, + "loss": 1.4165, + "grad_norm": 0.328104704618454, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455665152, + "loss": 1.3923, + "grad_norm": 0.19028036296367645, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4557696, + "loss": 1.4338, + "grad_norm": 0.1698009967803955, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45564672, + "loss": 1.4091, + "grad_norm": 0.1645933985710144, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455698944, + "loss": 1.4125, + "grad_norm": 0.1672719568014145, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455691264, + "loss": 1.4121, + "grad_norm": 0.15499304234981537, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455606784, + "loss": 1.4207, + "grad_norm": 0.1481475830078125, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455629824, + "loss": 1.359, + "grad_norm": 0.2631801664829254, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455678976, + "loss": 1.4018, + "grad_norm": 0.1627357006072998, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.3974, + "grad_norm": 0.1278548240661621, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455686656, + "loss": 1.428, + "grad_norm": 0.14154663681983948, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455729664, + "loss": 1.4562, + "grad_norm": 0.339764803647995, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45567744, + "loss": 1.4038, + "grad_norm": 0.23098774254322052, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455616, + "loss": 1.4226, + "grad_norm": 0.20886880159378052, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455717376, + "loss": 1.3977, + "grad_norm": 0.3230207562446594, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455734272, + "loss": 1.4447, + "grad_norm": 0.36863553524017334, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455574528, + "loss": 1.4245, + "grad_norm": 0.18651212751865387, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455680512, + "loss": 1.3988, + "grad_norm": 0.21150368452072144, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455652864, + "loss": 1.3742, + "grad_norm": 0.14768360555171967, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455666688, + "loss": 1.3771, + "grad_norm": 0.21049077808856964, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455672832, + "loss": 1.422, + "grad_norm": 0.24875298142433167, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455702016, + "loss": 1.4382, + "grad_norm": 0.2630525529384613, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455643648, + "loss": 1.3811, + "grad_norm": 0.160964235663414, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455665152, + "loss": 1.4201, + "grad_norm": 0.28085896372795105, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455691264, + "loss": 1.3465, + "grad_norm": 0.1559852510690689, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45563904, + "loss": 1.3719, + "grad_norm": 0.1913672536611557, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455671296, + "loss": 1.4232, + "grad_norm": 0.1992383599281311, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455663616, + "loss": 1.3681, + "grad_norm": 0.2501462996006012, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455603712, + "loss": 1.3981, + "grad_norm": 0.15586960315704346, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455804928, + "loss": 1.4189, + "grad_norm": 0.13466857373714447, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455619072, + "loss": 1.3993, + "grad_norm": 0.1925010085105896, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455786496, + "loss": 1.3901, + "grad_norm": 0.12607891857624054, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.3955, + "grad_norm": 0.23390938341617584, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455655936, + "loss": 1.3966, + "grad_norm": 0.1965438425540924, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455703552, + "loss": 1.3844, + "grad_norm": 0.23420394957065582, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45568512, + "loss": 1.3605, + "grad_norm": 0.2530987858772278, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455649792, + "loss": 1.3653, + "grad_norm": 0.1796950399875641, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.408, + "grad_norm": 0.12455020099878311, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455683584, + "loss": 1.386, + "grad_norm": 0.20781828463077545, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455643648, + "loss": 1.387, + "grad_norm": 0.25427743792533875, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455732736, + "loss": 1.385, + "grad_norm": 0.15431956946849823, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455602176, + "loss": 1.3742, + "grad_norm": 0.17231544852256775, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455665152, + "loss": 1.3925, + "grad_norm": 0.11091665178537369, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.3619, + "grad_norm": 0.14482708275318146, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455649792, + "loss": 1.374, + "grad_norm": 0.1556270718574524, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455818752, + "loss": 1.3997, + "grad_norm": 0.12362687289714813, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455616, + "loss": 1.3792, + "grad_norm": 0.17349569499492645, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455617536, + "loss": 1.382, + "grad_norm": 0.1533801406621933, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455717376, + "loss": 1.3779, + "grad_norm": 0.15566281974315643, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45581568, + "loss": 1.3829, + "grad_norm": 0.1547180712223053, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455645184, + "loss": 1.3864, + "grad_norm": 0.11252962797880173, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556544, + "loss": 1.4013, + "grad_norm": 0.1310528963804245, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45571584, + "loss": 1.3885, + "grad_norm": 0.1273925006389618, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45560832, + "loss": 1.3952, + "grad_norm": 0.1506008803844452, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455703552, + "loss": 1.3949, + "grad_norm": 0.13644754886627197, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455837184, + "loss": 1.4205, + "grad_norm": 0.23428145051002502, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455740416, + "loss": 1.3983, + "grad_norm": 0.256718248128891, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455786496, + "loss": 1.4011, + "grad_norm": 0.12740395963191986, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4557312, + "loss": 1.3907, + "grad_norm": 0.16288091242313385, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4557696, + "loss": 1.3798, + "grad_norm": 0.1859237253665924, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556928, + "loss": 1.4174, + "grad_norm": 0.2362847626209259, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455728128, + "loss": 1.4118, + "grad_norm": 0.14980214834213257, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455634432, + "loss": 1.3945, + "grad_norm": 0.22708477079868317, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455659008, + "loss": 1.3981, + "grad_norm": 0.17832283675670624, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45563136, + "loss": 1.3894, + "grad_norm": 0.15512587130069733, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455637504, + "loss": 1.4048, + "grad_norm": 0.21868066489696503, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455628288, + "loss": 1.3798, + "grad_norm": 0.19405047595500946, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45556224, + "loss": 1.3855, + "grad_norm": 0.1168946623802185, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455725056, + "loss": 1.3867, + "grad_norm": 0.12285705655813217, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455637504, + "loss": 1.3595, + "grad_norm": 0.2873629033565521, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455640576, + "loss": 1.3904, + "grad_norm": 0.11161809414625168, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455695872, + "loss": 1.3653, + "grad_norm": 0.17580336332321167, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455599104, + "loss": 1.4364, + "grad_norm": 0.2518826723098755, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455766528, + "loss": 1.3847, + "grad_norm": 0.13714748620986938, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.3731, + "grad_norm": 0.2554313540458679, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455622144, + "loss": 1.3526, + "grad_norm": 0.24349531531333923, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 1.3667, + "grad_norm": 0.21326081454753876, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455599104, + "loss": 1.4103, + "grad_norm": 0.25069767236709595, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.4128, + "grad_norm": 0.2663118243217468, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455659008, + "loss": 1.3948, + "grad_norm": 0.226455420255661, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455603712, + "loss": 1.3908, + "grad_norm": 0.12346458435058594, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45560064, + "loss": 1.3762, + "grad_norm": 0.19241105020046234, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455702016, + "loss": 1.3626, + "grad_norm": 0.21001924574375153, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455706624, + "loss": 1.3854, + "grad_norm": 0.2545386850833893, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455804928, + "loss": 1.3687, + "grad_norm": 0.1521541178226471, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455648256, + "loss": 1.3815, + "grad_norm": 0.164015531539917, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455640576, + "loss": 1.3915, + "grad_norm": 0.1408807635307312, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455583744, + "loss": 1.3786, + "grad_norm": 0.15991388261318207, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455616, + "loss": 1.3818, + "grad_norm": 0.17744256556034088, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455643648, + "loss": 1.363, + "grad_norm": 0.20023542642593384, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455694336, + "loss": 1.4148, + "grad_norm": 0.2627112865447998, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455606784, + "loss": 1.4009, + "grad_norm": 0.14480328559875488, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455626752, + "loss": 1.3757, + "grad_norm": 0.23013971745967865, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455703552, + "loss": 1.4138, + "grad_norm": 0.26364022493362427, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455605248, + "loss": 1.3978, + "grad_norm": 0.13766631484031677, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455706624, + "loss": 1.393, + "grad_norm": 0.13330651819705963, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455663616, + "loss": 1.3842, + "grad_norm": 0.23525984585285187, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45566208, + "loss": 1.3847, + "grad_norm": 0.21360930800437927, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455645184, + "loss": 1.3873, + "grad_norm": 0.12144456058740616, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45576192, + "loss": 1.3893, + "grad_norm": 0.13454005122184753, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455657472, + "loss": 1.3887, + "grad_norm": 0.1764563024044037, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455737344, + "loss": 1.3893, + "grad_norm": 0.13752621412277222, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45564672, + "loss": 1.3881, + "grad_norm": 0.24783319234848022, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45560832, + "loss": 1.3552, + "grad_norm": 0.21978147327899933, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455611392, + "loss": 1.3634, + "grad_norm": 0.25485971570014954, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455678976, + "loss": 1.3721, + "grad_norm": 0.20886285603046417, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455686656, + "loss": 1.3921, + "grad_norm": 0.12766045331954956, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.3678, + "grad_norm": 0.1606871336698532, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455640576, + "loss": 1.3777, + "grad_norm": 0.134430930018425, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45567744, + "loss": 1.3918, + "grad_norm": 0.17289933562278748, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455620608, + "loss": 1.4029, + "grad_norm": 0.18259429931640625, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455691264, + "loss": 1.3411, + "grad_norm": 0.1396682858467102, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455588352, + "loss": 1.3697, + "grad_norm": 0.1571730524301529, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455597568, + "loss": 1.394, + "grad_norm": 0.24166709184646606, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 1.3767, + "grad_norm": 0.20736737549304962, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455682048, + "loss": 1.405, + "grad_norm": 0.20414665341377258, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455688192, + "loss": 1.4095, + "grad_norm": 0.20567071437835693, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455606784, + "loss": 1.4033, + "grad_norm": 0.17558446526527405, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455717376, + "loss": 1.3896, + "grad_norm": 0.17092274129390717, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455705088, + "loss": 1.4049, + "grad_norm": 0.15569141507148743, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455688192, + "loss": 1.385, + "grad_norm": 0.2703980505466461, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455626752, + "loss": 1.3847, + "grad_norm": 0.3030729293823242, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455659008, + "loss": 1.3498, + "grad_norm": 0.15269865095615387, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455755776, + "loss": 1.396, + "grad_norm": 0.1470784842967987, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455728128, + "loss": 1.3722, + "grad_norm": 0.15281827747821808, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.4131, + "grad_norm": 0.2690964937210083, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455665152, + "loss": 1.3514, + "grad_norm": 0.1376069188117981, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556544, + "loss": 1.4027, + "grad_norm": 0.24593737721443176, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455591424, + "loss": 1.3584, + "grad_norm": 0.21613235771656036, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455712768, + "loss": 1.3854, + "grad_norm": 0.19199901819229126, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455634432, + "loss": 1.3643, + "grad_norm": 0.1615513116121292, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45574656, + "loss": 1.3869, + "grad_norm": 0.14072203636169434, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455741952, + "loss": 1.3717, + "grad_norm": 0.12512129545211792, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 2.0874, + "grad_norm": 0.31740400195121765, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468357632, + "loss": 1.3667, + "grad_norm": 0.18417689204216003, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46835456, + "loss": 1.3688, + "grad_norm": 0.178513765335083, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46823936, + "loss": 1.3811, + "grad_norm": 0.24014514684677124, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468290048, + "loss": 1.3892, + "grad_norm": 0.241567000746727, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468283904, + "loss": 1.4125, + "grad_norm": 0.2212083637714386, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46833152, + "loss": 1.3817, + "grad_norm": 0.1247771680355072, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468263936, + "loss": 1.3703, + "grad_norm": 0.16290989518165588, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468310016, + "loss": 1.3792, + "grad_norm": 0.1646203249692917, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46832384, + "loss": 1.352, + "grad_norm": 0.25204771757125854, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468279296, + "loss": 1.3754, + "grad_norm": 0.24655361473560333, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468271616, + "loss": 1.3704, + "grad_norm": 0.14845556020736694, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46839296, + "loss": 1.3812, + "grad_norm": 0.18303048610687256, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468290048, + "loss": 1.3743, + "grad_norm": 0.22597886621952057, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 1.3747, + "grad_norm": 0.22880129516124725, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468294656, + "loss": 1.3789, + "grad_norm": 0.17018520832061768, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468220928, + "loss": 1.387, + "grad_norm": 0.14244140684604645, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46821632, + "loss": 1.3582, + "grad_norm": 0.18038666248321533, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468267008, + "loss": 1.4008, + "grad_norm": 0.45009222626686096, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468396032, + "loss": 1.3834, + "grad_norm": 0.16312003135681152, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468257792, + "loss": 1.4242, + "grad_norm": 0.31910964846611023, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468283904, + "loss": 1.404, + "grad_norm": 0.2746158540248871, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468296192, + "loss": 1.4111, + "grad_norm": 0.17097268998622894, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468268544, + "loss": 1.3935, + "grad_norm": 0.27749088406562805, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468242432, + "loss": 1.4147, + "grad_norm": 0.15319931507110596, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46825472, + "loss": 1.3669, + "grad_norm": 0.11509262770414352, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468282368, + "loss": 1.3571, + "grad_norm": 0.24220424890518188, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4683008, + "loss": 1.399, + "grad_norm": 0.17532102763652802, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468351488, + "loss": 1.3463, + "grad_norm": 0.15844811499118805, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468302336, + "loss": 1.3988, + "grad_norm": 0.12179142236709595, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468253184, + "loss": 1.3797, + "grad_norm": 0.13511726260185242, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468297728, + "loss": 1.3741, + "grad_norm": 0.15079058706760406, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468196352, + "loss": 1.364, + "grad_norm": 0.12500226497650146, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468263936, + "loss": 1.3779, + "grad_norm": 0.11752685159444809, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46825472, + "loss": 1.4007, + "grad_norm": 0.21308255195617676, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468259328, + "loss": 1.3429, + "grad_norm": 0.19528938829898834, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468374528, + "loss": 1.3723, + "grad_norm": 0.24350810050964355, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468334592, + "loss": 1.432, + "grad_norm": 0.3143423795700073, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46829312, + "loss": 1.4251, + "grad_norm": 0.32041120529174805, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 1.368, + "grad_norm": 0.229181706905365, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46831616, + "loss": 1.3911, + "grad_norm": 0.18737292289733887, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468336128, + "loss": 1.3939, + "grad_norm": 0.12819884717464447, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468253184, + "loss": 1.3693, + "grad_norm": 0.17918798327445984, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468388352, + "loss": 1.3781, + "grad_norm": 0.12879769504070282, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468342272, + "loss": 1.3854, + "grad_norm": 0.220515638589859, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468363776, + "loss": 1.3941, + "grad_norm": 0.17368264496326447, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468250112, + "loss": 1.3566, + "grad_norm": 0.41035738587379456, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468237824, + "loss": 1.4477, + "grad_norm": 0.29978740215301514, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468242432, + "loss": 1.3863, + "grad_norm": 0.19055882096290588, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468279296, + "loss": 1.3841, + "grad_norm": 0.19380445778369904, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468240896, + "loss": 1.3763, + "grad_norm": 0.24430827796459198, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468286976, + "loss": 1.3986, + "grad_norm": 0.16130293905735016, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468296192, + "loss": 1.398, + "grad_norm": 0.1813725084066391, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468334592, + "loss": 1.3687, + "grad_norm": 0.13183936476707458, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468253184, + "loss": 1.3829, + "grad_norm": 0.21588033437728882, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4682624, + "loss": 1.3691, + "grad_norm": 0.2584744691848755, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468328448, + "loss": 1.3775, + "grad_norm": 0.13085049390792847, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46825472, + "loss": 1.3716, + "grad_norm": 0.18273581564426422, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468248576, + "loss": 1.3934, + "grad_norm": 0.22063370048999786, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468314624, + "loss": 1.3815, + "grad_norm": 0.18276959657669067, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468273152, + "loss": 1.3773, + "grad_norm": 0.12849368155002594, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468386816, + "loss": 1.3697, + "grad_norm": 0.17425848543643951, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468263936, + "loss": 1.3761, + "grad_norm": 0.16136986017227173, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468290048, + "loss": 1.3921, + "grad_norm": 0.13379168510437012, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468296192, + "loss": 1.4007, + "grad_norm": 0.3055986166000366, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468363776, + "loss": 1.3748, + "grad_norm": 0.14555881917476654, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468219392, + "loss": 1.3846, + "grad_norm": 0.16069193184375763, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468353024, + "loss": 1.377, + "grad_norm": 0.10386576503515244, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468382208, + "loss": 1.3672, + "grad_norm": 0.17562562227249146, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468251648, + "loss": 1.397, + "grad_norm": 0.17035603523254395, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468253184, + "loss": 1.3963, + "grad_norm": 0.17072318494319916, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46825472, + "loss": 1.3535, + "grad_norm": 0.10525336861610413, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468328448, + "loss": 1.402, + "grad_norm": 0.19079315662384033, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468310016, + "loss": 1.3631, + "grad_norm": 0.12190406769514084, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468253184, + "loss": 1.4172, + "grad_norm": 0.17969341576099396, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 1.3766, + "grad_norm": 0.20288491249084473, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468359168, + "loss": 1.3853, + "grad_norm": 0.11844371259212494, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468311552, + "loss": 1.4155, + "grad_norm": 0.18590521812438965, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46843904, + "loss": 1.3928, + "grad_norm": 0.1833382546901703, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46832384, + "loss": 1.3781, + "grad_norm": 0.14808326959609985, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468273152, + "loss": 1.3862, + "grad_norm": 0.19257061183452606, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468260864, + "loss": 1.3405, + "grad_norm": 0.20402809977531433, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468253184, + "loss": 1.4033, + "grad_norm": 0.11357319355010986, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468376064, + "loss": 1.3694, + "grad_norm": 0.15858721733093262, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46831616, + "loss": 1.3718, + "grad_norm": 0.20792405307292938, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4682624, + "loss": 1.377, + "grad_norm": 0.16555483639240265, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468279296, + "loss": 1.4069, + "grad_norm": 0.18612040579319, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468419072, + "loss": 1.3691, + "grad_norm": 0.15144553780555725, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468290048, + "loss": 1.3908, + "grad_norm": 0.16589169204235077, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46828544, + "loss": 1.3992, + "grad_norm": 0.1998831331729889, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468188672, + "loss": 1.3787, + "grad_norm": 0.19862394034862518, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468263936, + "loss": 1.3558, + "grad_norm": 0.195574089884758, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468283904, + "loss": 1.3925, + "grad_norm": 0.1872778683900833, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468250112, + "loss": 1.3702, + "grad_norm": 0.13362334668636322, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468211712, + "loss": 1.3786, + "grad_norm": 0.14183945953845978, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468337664, + "loss": 1.3925, + "grad_norm": 0.126989483833313, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468214784, + "loss": 1.4049, + "grad_norm": 0.17351119220256805, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468257792, + "loss": 1.4136, + "grad_norm": 0.1770288050174713, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468240896, + "loss": 1.3514, + "grad_norm": 0.2657321095466614, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468282368, + "loss": 1.3747, + "grad_norm": 0.14276258647441864, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468222464, + "loss": 1.3478, + "grad_norm": 0.10167302936315536, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468245504, + "loss": 1.3527, + "grad_norm": 0.2246718555688858, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468234752, + "loss": 1.3714, + "grad_norm": 0.1379583179950714, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46824704, + "loss": 1.386, + "grad_norm": 0.13369157910346985, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468250112, + "loss": 1.3448, + "grad_norm": 0.2943602204322815, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46823168, + "loss": 1.3806, + "grad_norm": 0.12263234704732895, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468237824, + "loss": 1.3693, + "grad_norm": 0.1992911994457245, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4681856, + "loss": 1.4283, + "grad_norm": 0.4879559576511383, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468257792, + "loss": 1.3744, + "grad_norm": 0.17813007533550262, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468299264, + "loss": 1.3607, + "grad_norm": 0.18805313110351562, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46824704, + "loss": 1.3526, + "grad_norm": 0.12624238431453705, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468257792, + "loss": 1.4107, + "grad_norm": 0.19703735411167145, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468240896, + "loss": 1.3796, + "grad_norm": 0.24067336320877075, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468296192, + "loss": 1.3864, + "grad_norm": 0.16809892654418945, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4682624, + "loss": 1.3761, + "grad_norm": 0.09654642641544342, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468271616, + "loss": 1.4091, + "grad_norm": 0.160189688205719, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468273152, + "loss": 1.3811, + "grad_norm": 0.27471238374710083, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46850048, + "loss": 1.3876, + "grad_norm": 0.1969471424818039, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46829312, + "loss": 1.3683, + "grad_norm": 0.16458091139793396, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468265472, + "loss": 1.3752, + "grad_norm": 0.14666511118412018, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468230144, + "loss": 1.3779, + "grad_norm": 0.29237911105155945, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468320768, + "loss": 1.3699, + "grad_norm": 0.17051124572753906, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468245504, + "loss": 1.3817, + "grad_norm": 0.22691072523593903, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46829312, + "loss": 1.374, + "grad_norm": 0.1409837007522583, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468234752, + "loss": 1.3862, + "grad_norm": 0.2341122329235077, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468455936, + "loss": 1.3881, + "grad_norm": 0.1569787710905075, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468337664, + "loss": 1.3701, + "grad_norm": 0.14545221626758575, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468230144, + "loss": 1.3554, + "grad_norm": 0.3090737760066986, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46827008, + "loss": 1.39, + "grad_norm": 0.18121521174907684, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468317696, + "loss": 1.3802, + "grad_norm": 0.1759660542011261, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468283904, + "loss": 1.3979, + "grad_norm": 0.16712847352027893, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4683392, + "loss": 1.3894, + "grad_norm": 0.15217722952365875, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46823168, + "loss": 1.3727, + "grad_norm": 0.1300564706325531, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46831616, + "loss": 1.3614, + "grad_norm": 0.23001670837402344, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468305408, + "loss": 1.3755, + "grad_norm": 0.24727769196033478, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468260864, + "loss": 1.3804, + "grad_norm": 0.14643321931362152, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468349952, + "loss": 1.3961, + "grad_norm": 0.1250706911087036, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46827776, + "loss": 1.3801, + "grad_norm": 0.13527068495750427, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468286976, + "loss": 1.3872, + "grad_norm": 0.11812220513820648, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4683392, + "loss": 1.36, + "grad_norm": 0.12820464372634888, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4683392, + "loss": 1.3553, + "grad_norm": 0.3400281071662903, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468376064, + "loss": 1.3798, + "grad_norm": 0.23647283017635345, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46829312, + "loss": 1.404, + "grad_norm": 0.19160452485084534, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46836224, + "loss": 1.3863, + "grad_norm": 0.12514011561870575, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468303872, + "loss": 1.3674, + "grad_norm": 0.13792411983013153, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468310016, + "loss": 1.3577, + "grad_norm": 0.2325419783592224, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468220928, + "loss": 1.3921, + "grad_norm": 0.22895346581935883, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4683392, + "loss": 1.3868, + "grad_norm": 0.17292290925979614, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 1.3549, + "grad_norm": 0.12412680685520172, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468237824, + "loss": 1.3874, + "grad_norm": 0.2085018754005432, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468286976, + "loss": 1.3793, + "grad_norm": 0.25484591722488403, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468412928, + "loss": 1.3899, + "grad_norm": 0.1719558835029602, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468211712, + "loss": 1.3401, + "grad_norm": 0.13503798842430115, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468276224, + "loss": 1.3644, + "grad_norm": 0.14158222079277039, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 1.3671, + "grad_norm": 0.17420759797096252, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468282368, + "loss": 1.3758, + "grad_norm": 0.10984969884157181, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468279296, + "loss": 1.3721, + "grad_norm": 0.21302281320095062, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46825472, + "loss": 1.3741, + "grad_norm": 0.15973041951656342, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468237824, + "loss": 1.3624, + "grad_norm": 0.10937695950269699, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46823936, + "loss": 1.3734, + "grad_norm": 0.09530307352542877, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 1.3624, + "grad_norm": 0.2594415843486786, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468357632, + "loss": 1.3754, + "grad_norm": 0.34069398045539856, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468311552, + "loss": 1.366, + "grad_norm": 0.16583462059497833, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46836224, + "loss": 1.3903, + "grad_norm": 0.11170898377895355, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468333056, + "loss": 1.4304, + "grad_norm": 0.16833265125751495, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468267008, + "loss": 1.3555, + "grad_norm": 0.12190385162830353, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468224, + "loss": 1.3583, + "grad_norm": 0.146173357963562, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468251648, + "loss": 1.3782, + "grad_norm": 0.2962832450866699, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468234752, + "loss": 1.382, + "grad_norm": 0.13275086879730225, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46827776, + "loss": 1.3678, + "grad_norm": 0.1790265142917633, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46833152, + "loss": 1.3671, + "grad_norm": 0.15872983634471893, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468228608, + "loss": 1.365, + "grad_norm": 0.1888638436794281, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468288512, + "loss": 1.3439, + "grad_norm": 0.19863355159759521, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468211712, + "loss": 1.3667, + "grad_norm": 0.32510823011398315, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468371456, + "loss": 1.3714, + "grad_norm": 0.168460875749588, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468225536, + "loss": 1.3534, + "grad_norm": 0.11180169880390167, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468248576, + "loss": 1.3395, + "grad_norm": 0.12151626497507095, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468271616, + "loss": 1.4294, + "grad_norm": 0.313635915517807, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468495872, + "loss": 1.428, + "grad_norm": 0.27290916442871094, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468253184, + "loss": 1.3785, + "grad_norm": 0.15923525393009186, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468376064, + "loss": 1.396, + "grad_norm": 0.20447582006454468, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468291584, + "loss": 1.3959, + "grad_norm": 0.17306174337863922, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468380672, + "loss": 1.3602, + "grad_norm": 0.24845319986343384, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468279296, + "loss": 1.3461, + "grad_norm": 0.16327160596847534, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468286976, + "loss": 1.3791, + "grad_norm": 0.170322448015213, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468210176, + "loss": 1.3475, + "grad_norm": 0.21285127103328705, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468290048, + "loss": 1.394, + "grad_norm": 0.23720118403434753, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468260864, + "loss": 1.3728, + "grad_norm": 0.21460364758968353, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468248576, + "loss": 1.3613, + "grad_norm": 0.1754642128944397, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468391424, + "loss": 1.3518, + "grad_norm": 0.24990974366664886, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468274688, + "loss": 1.3806, + "grad_norm": 0.13970693945884705, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46828544, + "loss": 1.3926, + "grad_norm": 0.1976822316646576, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468440576, + "loss": 1.385, + "grad_norm": 0.21725395321846008, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468250112, + "loss": 1.4038, + "grad_norm": 0.19912518560886383, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.928228864, + "gpu_mem": 4.468271616, + "loss": 1.3904, + "grad_norm": 0.17259374260902405, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.928228864, + "gpu_mem": 4.46830848, + "loss": 1.3973, + "grad_norm": 0.1761516034603119, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.928228864, + "gpu_mem": 4.467895296, + "loss": 2.0293, + "grad_norm": 0.26432839035987854, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455614464, + "loss": 1.3766, + "grad_norm": 0.29354530572891235, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45559296, + "loss": 1.3862, + "grad_norm": 0.1718294471502304, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45560832, + "loss": 1.3784, + "grad_norm": 0.25550195574760437, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455755776, + "loss": 1.3851, + "grad_norm": 0.16924288868904114, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455702016, + "loss": 1.3992, + "grad_norm": 0.1783556342124939, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455718912, + "loss": 1.41, + "grad_norm": 0.20210079848766327, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455671296, + "loss": 1.3995, + "grad_norm": 0.22543707489967346, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455695872, + "loss": 1.3872, + "grad_norm": 0.1975255161523819, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455743488, + "loss": 1.3738, + "grad_norm": 0.251764178276062, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455652864, + "loss": 1.3888, + "grad_norm": 0.27951356768608093, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455625216, + "loss": 1.3574, + "grad_norm": 0.2341318130493164, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455714304, + "loss": 1.3515, + "grad_norm": 0.1575353592634201, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455605248, + "loss": 1.385, + "grad_norm": 0.25904008746147156, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455657472, + "loss": 1.3703, + "grad_norm": 0.1820371150970459, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455611392, + "loss": 1.3467, + "grad_norm": 0.23251213133335114, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455814144, + "loss": 1.3697, + "grad_norm": 0.2618219256401062, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455812608, + "loss": 1.4011, + "grad_norm": 0.3194250166416168, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455735808, + "loss": 1.3654, + "grad_norm": 0.22246694564819336, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556928, + "loss": 1.3778, + "grad_norm": 0.19960571825504303, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45566976, + "loss": 1.3469, + "grad_norm": 0.17057035863399506, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45562368, + "loss": 1.3728, + "grad_norm": 0.22431083023548126, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455740416, + "loss": 1.3925, + "grad_norm": 0.27506789565086365, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455628288, + "loss": 1.3792, + "grad_norm": 0.1600046008825302, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455617536, + "loss": 1.3802, + "grad_norm": 0.15285362303256989, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455622144, + "loss": 1.3821, + "grad_norm": 0.1384151726961136, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455737344, + "loss": 1.3835, + "grad_norm": 0.27998876571655273, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455711232, + "loss": 1.3641, + "grad_norm": 0.20075851678848267, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455612928, + "loss": 1.3507, + "grad_norm": 0.26313287019729614, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455648256, + "loss": 1.3478, + "grad_norm": 0.1729445457458496, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455614464, + "loss": 1.361, + "grad_norm": 0.1332211196422577, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455748096, + "loss": 1.349, + "grad_norm": 0.11840900033712387, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455721984, + "loss": 1.3746, + "grad_norm": 0.23245105147361755, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455766528, + "loss": 1.3359, + "grad_norm": 0.283547967672348, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455657472, + "loss": 1.3608, + "grad_norm": 0.2748339772224426, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 1.3673, + "grad_norm": 0.3982786536216736, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455749632, + "loss": 1.3696, + "grad_norm": 0.1911727637052536, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455597568, + "loss": 1.3775, + "grad_norm": 0.17001456022262573, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45571584, + "loss": 1.376, + "grad_norm": 0.32780879735946655, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455705088, + "loss": 1.3435, + "grad_norm": 0.2518489956855774, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455671296, + "loss": 1.3672, + "grad_norm": 0.1276160180568695, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455649792, + "loss": 1.3687, + "grad_norm": 0.23236069083213806, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455645184, + "loss": 1.3829, + "grad_norm": 0.25693777203559875, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455655936, + "loss": 1.3691, + "grad_norm": 0.21493315696716309, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455709696, + "loss": 1.3993, + "grad_norm": 0.2377517819404602, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455626752, + "loss": 1.3919, + "grad_norm": 0.1800578385591507, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556544, + "loss": 1.3975, + "grad_norm": 0.33508211374282837, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455652864, + "loss": 1.3661, + "grad_norm": 0.21546570956707, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455695872, + "loss": 1.3913, + "grad_norm": 0.25290998816490173, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455694336, + "loss": 1.3715, + "grad_norm": 0.28639110922813416, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455745024, + "loss": 1.367, + "grad_norm": 0.15972523391246796, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455751168, + "loss": 1.3599, + "grad_norm": 0.12162455916404724, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455689728, + "loss": 1.3793, + "grad_norm": 0.1698918342590332, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455678976, + "loss": 1.3821, + "grad_norm": 0.24470461905002594, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455703552, + "loss": 1.3829, + "grad_norm": 0.16081811487674713, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455720448, + "loss": 1.3639, + "grad_norm": 0.27067840099334717, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455591424, + "loss": 1.363, + "grad_norm": 0.15692368149757385, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455766528, + "loss": 1.3678, + "grad_norm": 0.2218252569437027, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45572352, + "loss": 1.3671, + "grad_norm": 0.18593786656856537, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.3872, + "grad_norm": 0.24128516018390656, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455580672, + "loss": 1.3631, + "grad_norm": 0.12226473540067673, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455596032, + "loss": 1.3976, + "grad_norm": 0.1701870858669281, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455697408, + "loss": 1.3496, + "grad_norm": 0.1485370397567749, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455706624, + "loss": 1.4067, + "grad_norm": 0.3610127568244934, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455735808, + "loss": 1.3771, + "grad_norm": 0.25061753392219543, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455797248, + "loss": 1.354, + "grad_norm": 0.22757993638515472, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455651328, + "loss": 1.3737, + "grad_norm": 0.19061650335788727, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45573888, + "loss": 1.374, + "grad_norm": 0.43389540910720825, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.3853, + "grad_norm": 0.2784370183944702, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455712768, + "loss": 1.3481, + "grad_norm": 0.12067332118749619, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455748096, + "loss": 1.3813, + "grad_norm": 0.1903553605079651, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455686656, + "loss": 1.345, + "grad_norm": 0.20766408741474152, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455643648, + "loss": 1.3461, + "grad_norm": 0.163682222366333, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455645184, + "loss": 1.3829, + "grad_norm": 0.16027969121932983, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455682048, + "loss": 1.3435, + "grad_norm": 0.22953850030899048, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556544, + "loss": 1.3532, + "grad_norm": 0.21886137127876282, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455711232, + "loss": 1.3863, + "grad_norm": 0.22119812667369843, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455740416, + "loss": 1.3592, + "grad_norm": 0.2492137849330902, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455640576, + "loss": 1.3781, + "grad_norm": 0.2303318828344345, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455663616, + "loss": 1.3469, + "grad_norm": 0.23047612607479095, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45563904, + "loss": 1.3763, + "grad_norm": 0.24309685826301575, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4557312, + "loss": 1.4017, + "grad_norm": 0.21080560982227325, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45560064, + "loss": 1.358, + "grad_norm": 0.18057696521282196, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455737344, + "loss": 1.3836, + "grad_norm": 0.16161291301250458, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455594496, + "loss": 1.379, + "grad_norm": 0.31396201252937317, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455617536, + "loss": 1.3504, + "grad_norm": 0.1302901655435562, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455665152, + "loss": 1.3951, + "grad_norm": 0.2466396540403366, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45570816, + "loss": 1.3707, + "grad_norm": 0.1307106465101242, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455635968, + "loss": 1.376, + "grad_norm": 0.1881062090396881, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455734272, + "loss": 1.3721, + "grad_norm": 0.23399418592453003, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455629824, + "loss": 1.3787, + "grad_norm": 0.19066168367862701, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45578496, + "loss": 1.4012, + "grad_norm": 0.18290597200393677, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455617536, + "loss": 1.3605, + "grad_norm": 0.19260449707508087, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.4009, + "grad_norm": 0.21811369061470032, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455682048, + "loss": 1.3793, + "grad_norm": 0.20217399299144745, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455605248, + "loss": 1.3506, + "grad_norm": 0.16086356341838837, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455741952, + "loss": 1.3654, + "grad_norm": 0.2242749035358429, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455629824, + "loss": 1.348, + "grad_norm": 0.16430974006652832, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455626752, + "loss": 1.4085, + "grad_norm": 0.20375946164131165, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45560064, + "loss": 1.3128, + "grad_norm": 0.13947565853595734, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455635968, + "loss": 1.3348, + "grad_norm": 0.2146555632352829, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455651328, + "loss": 1.3565, + "grad_norm": 0.2136322408914566, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455645184, + "loss": 1.3489, + "grad_norm": 0.16404341161251068, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455657472, + "loss": 1.4092, + "grad_norm": 0.26788437366485596, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455689728, + "loss": 1.346, + "grad_norm": 0.19344116747379303, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455691264, + "loss": 1.3616, + "grad_norm": 0.11352292448282242, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455663616, + "loss": 1.3783, + "grad_norm": 0.22407680749893188, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455712768, + "loss": 1.345, + "grad_norm": 0.26863396167755127, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.3765, + "grad_norm": 0.21582777798175812, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45570048, + "loss": 1.3557, + "grad_norm": 0.25753501057624817, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455691264, + "loss": 1.3495, + "grad_norm": 0.1416066586971283, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455625216, + "loss": 1.349, + "grad_norm": 0.27636703848838806, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45564672, + "loss": 1.3647, + "grad_norm": 0.18816597759723663, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455702016, + "loss": 1.3661, + "grad_norm": 0.4456332325935364, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455619072, + "loss": 1.3659, + "grad_norm": 0.13826321065425873, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455683584, + "loss": 1.372, + "grad_norm": 0.19406495988368988, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.3912, + "grad_norm": 0.2620127201080322, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 1.3808, + "grad_norm": 0.2516784369945526, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455648256, + "loss": 1.3658, + "grad_norm": 0.1932203769683838, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.3412, + "grad_norm": 0.18061627447605133, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455652864, + "loss": 1.3873, + "grad_norm": 0.2458008974790573, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455557632, + "loss": 1.3634, + "grad_norm": 0.22297808527946472, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455626752, + "loss": 1.3486, + "grad_norm": 0.27702632546424866, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 1.3606, + "grad_norm": 0.21695153415203094, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455619072, + "loss": 1.3761, + "grad_norm": 0.33167293667793274, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45567744, + "loss": 1.3746, + "grad_norm": 0.20105969905853271, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455683584, + "loss": 1.345, + "grad_norm": 0.1747850924730301, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455820288, + "loss": 1.3559, + "grad_norm": 0.15271995961666107, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455629824, + "loss": 1.3723, + "grad_norm": 0.23024749755859375, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455694336, + "loss": 1.3315, + "grad_norm": 0.23053520917892456, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455609856, + "loss": 1.3735, + "grad_norm": 0.28014370799064636, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455626752, + "loss": 1.3611, + "grad_norm": 0.1415228247642517, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455626752, + "loss": 1.3249, + "grad_norm": 0.19838416576385498, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455752704, + "loss": 1.3586, + "grad_norm": 0.14876754581928253, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.3402, + "grad_norm": 0.22048622369766235, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455791104, + "loss": 1.4013, + "grad_norm": 0.24496914446353912, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455616, + "loss": 1.3328, + "grad_norm": 0.1545071303844452, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455634432, + "loss": 1.3935, + "grad_norm": 0.20377838611602783, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455811072, + "loss": 1.3382, + "grad_norm": 0.19216984510421753, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45568512, + "loss": 1.3887, + "grad_norm": 0.16510340571403503, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455682048, + "loss": 1.3425, + "grad_norm": 0.21295695006847382, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455691264, + "loss": 1.3809, + "grad_norm": 0.15528766810894012, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455566848, + "loss": 1.3472, + "grad_norm": 0.16844607889652252, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455849472, + "loss": 1.3685, + "grad_norm": 0.2264775186777115, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455612928, + "loss": 1.3522, + "grad_norm": 0.15344296395778656, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45570816, + "loss": 1.3676, + "grad_norm": 0.12822727859020233, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 1.365, + "grad_norm": 0.32906296849250793, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.3842, + "grad_norm": 0.23827455937862396, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.3699, + "grad_norm": 0.1801334172487259, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455711232, + "loss": 1.4122, + "grad_norm": 0.2786211371421814, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455606784, + "loss": 1.3683, + "grad_norm": 0.19462022185325623, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455635968, + "loss": 1.3567, + "grad_norm": 0.19227056205272675, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455579136, + "loss": 1.4129, + "grad_norm": 0.19007696211338043, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556544, + "loss": 1.3543, + "grad_norm": 0.16846071183681488, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45564672, + "loss": 1.3895, + "grad_norm": 0.19129273295402527, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45567744, + "loss": 1.3895, + "grad_norm": 0.2054392546415329, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455606784, + "loss": 1.3789, + "grad_norm": 0.17730583250522614, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.3535, + "grad_norm": 0.25382041931152344, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455728128, + "loss": 1.3641, + "grad_norm": 0.19844534993171692, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45562368, + "loss": 1.3336, + "grad_norm": 0.14376743137836456, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455632896, + "loss": 1.4203, + "grad_norm": 0.21967819333076477, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455660544, + "loss": 1.3645, + "grad_norm": 0.2106781154870987, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455674368, + "loss": 1.3775, + "grad_norm": 0.24160148203372955, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455668224, + "loss": 1.3801, + "grad_norm": 0.16507750749588013, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45563136, + "loss": 1.3623, + "grad_norm": 0.21155130863189697, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455689728, + "loss": 1.3781, + "grad_norm": 0.24674245715141296, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455599104, + "loss": 1.3738, + "grad_norm": 0.2119845151901245, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455683584, + "loss": 1.3829, + "grad_norm": 0.15665999054908752, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455721984, + "loss": 1.3662, + "grad_norm": 0.2546520531177521, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455766528, + "loss": 1.4216, + "grad_norm": 0.14545999467372894, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455728128, + "loss": 1.3524, + "grad_norm": 0.2089168280363083, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455665152, + "loss": 1.3803, + "grad_norm": 0.21919827163219452, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455718912, + "loss": 1.377, + "grad_norm": 0.29513630270957947, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455657472, + "loss": 1.4129, + "grad_norm": 0.33316928148269653, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455643648, + "loss": 1.3577, + "grad_norm": 0.19456513226032257, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455855616, + "loss": 1.3469, + "grad_norm": 0.15289492905139923, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455720448, + "loss": 1.3443, + "grad_norm": 0.3325195908546448, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455694336, + "loss": 1.3756, + "grad_norm": 0.22210504114627838, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45570048, + "loss": 1.345, + "grad_norm": 0.1980503499507904, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455734272, + "loss": 1.3791, + "grad_norm": 0.2280358523130417, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.4138, + "grad_norm": 0.18516099452972412, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455726592, + "loss": 1.3929, + "grad_norm": 0.2800382673740387, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455614464, + "loss": 1.3622, + "grad_norm": 0.13153746724128723, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45567744, + "loss": 1.3847, + "grad_norm": 0.20825086534023285, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.3424, + "grad_norm": 0.19902299344539642, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455642112, + "loss": 1.3681, + "grad_norm": 0.14945538341999054, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455726592, + "loss": 1.3667, + "grad_norm": 0.18246431648731232, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455672832, + "loss": 1.3845, + "grad_norm": 0.18712550401687622, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556928, + "loss": 1.3737, + "grad_norm": 0.22718262672424316, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455620608, + "loss": 1.362, + "grad_norm": 0.3022129237651825, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.928228864, + "gpu_mem": 4.45570816, + "loss": 1.3804, + "grad_norm": 0.16216367483139038, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455675904, + "loss": 1.3513, + "grad_norm": 0.16643819212913513, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.928228864, + "gpu_mem": 4.4556544, + "loss": 1.3435, + "grad_norm": 0.1452353149652481, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455597568, + "loss": 1.3823, + "grad_norm": 0.38013139367103577, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455663616, + "loss": 1.4202, + "grad_norm": 0.16252781450748444, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455705088, + "loss": 1.3756, + "grad_norm": 0.11781106889247894, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.928228864, + "gpu_mem": 4.455705088, + "train_runtime": 16661.7142, + "train_samples_per_second": 2.263, + "train_steps_per_second": 0.035, + "total_flos": 8.848978954662298e+16, + "train_loss": 1.4949758492764973 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef1d724eca7640a4f365c193cda2fc4efdb2073 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..01685f8791c03e68d1ed3d35303d6c1f3edde2a7 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.2807271224953522 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..569ccc147aa2b76bf86eb743388c6e0cefab2a31 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 50462720 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-logiqa-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2", + "seed": 42, + "timestamp": "2025-09-13T23:24:08.488578" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..8112f9124ddc4587a013d1363b773ae2a2c6286e --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r32-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 2.087186432, + "gpu_mem": 4.619652608, + "loss": 3.8396, + "grad_norm": 3.630662679672241, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023373824, + "loss": 3.9728, + "grad_norm": 3.5784475803375244, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023450624, + "loss": 3.8382, + "grad_norm": 3.518805980682373, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023349248, + "loss": 3.7808, + "grad_norm": 3.8132665157318115, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023364608, + "loss": 3.85, + "grad_norm": 3.577756881713867, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023356928, + "loss": 3.7509, + "grad_norm": 3.358081579208374, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023427584, + "loss": 3.7069, + "grad_norm": 3.414523124694824, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023412224, + "loss": 3.5096, + "grad_norm": 3.2670960426330566, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023410688, + "loss": 3.4069, + "grad_norm": 3.1287989616394043, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 2.09229824, + "gpu_mem": 5.02342144, + "loss": 3.6127, + "grad_norm": 3.020658254623413, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023324672, + "loss": 3.2461, + "grad_norm": 2.7496652603149414, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 2.09229824, + "gpu_mem": 5.02337536, + "loss": 3.0142, + "grad_norm": 2.6029510498046875, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 2.09229824, + "gpu_mem": 5.02346752, + "loss": 2.9369, + "grad_norm": 2.4328181743621826, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023379968, + "loss": 2.8732, + "grad_norm": 2.2739627361297607, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023518208, + "loss": 2.6522, + "grad_norm": 2.166672468185425, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023378432, + "loss": 2.453, + "grad_norm": 1.6648517847061157, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023409152, + "loss": 2.2312, + "grad_norm": 1.545669674873352, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023372288, + "loss": 2.3001, + "grad_norm": 1.4697089195251465, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023280128, + "loss": 2.1677, + "grad_norm": 1.2337696552276611, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023318528, + "loss": 2.0523, + "grad_norm": 1.056970477104187, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 2.09229824, + "gpu_mem": 5.02345216, + "loss": 1.7443, + "grad_norm": 0.7450931668281555, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023350784, + "loss": 1.7728, + "grad_norm": 0.7447719573974609, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023378432, + "loss": 1.7955, + "grad_norm": 0.7510949373245239, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023372288, + "loss": 1.6224, + "grad_norm": 0.46836772561073303, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023378432, + "loss": 1.6224, + "grad_norm": 0.34628212451934814, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023430656, + "loss": 1.4981, + "grad_norm": 0.26384565234184265, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023372288, + "loss": 1.5069, + "grad_norm": 0.2635621726512909, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023318528, + "loss": 1.4706, + "grad_norm": 0.15952464938163757, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023410688, + "loss": 1.493, + "grad_norm": 0.17345111072063446, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 2.09229824, + "gpu_mem": 5.02340608, + "loss": 1.3972, + "grad_norm": 0.2462405562400818, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 2.09229824, + "gpu_mem": 5.023384576, + "loss": 1.4114, + "grad_norm": 0.15580488741397858, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023389184, + "loss": 1.5029, + "grad_norm": 0.24210672080516815, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023424512, + "loss": 1.3608, + "grad_norm": 0.13385920226573944, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023349248, + "loss": 1.5442, + "grad_norm": 0.23965148627758026, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023396864, + "loss": 1.505, + "grad_norm": 0.29093098640441895, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023412224, + "loss": 1.5109, + "grad_norm": 0.3264513611793518, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023478272, + "loss": 1.3759, + "grad_norm": 0.1908360719680786, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023361536, + "loss": 1.4169, + "grad_norm": 0.1332641988992691, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023478272, + "loss": 1.4593, + "grad_norm": 0.1945722997188568, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023399936, + "loss": 1.4041, + "grad_norm": 0.12878568470478058, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02329856, + "loss": 1.4339, + "grad_norm": 0.17195849120616913, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023370752, + "loss": 1.4244, + "grad_norm": 0.15130512416362762, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023333888, + "loss": 1.4052, + "grad_norm": 0.18678444623947144, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023369216, + "loss": 1.4222, + "grad_norm": 0.1313791573047638, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023422976, + "loss": 1.4074, + "grad_norm": 0.128699392080307, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023470592, + "loss": 1.4016, + "grad_norm": 0.130779430270195, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02329856, + "loss": 1.4033, + "grad_norm": 0.1434730887413025, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02335232, + "loss": 1.4454, + "grad_norm": 0.17990173399448395, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023341568, + "loss": 1.3804, + "grad_norm": 0.11867712438106537, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023350784, + "loss": 1.427, + "grad_norm": 0.17877569794654846, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023446016, + "loss": 1.3945, + "grad_norm": 0.1667715311050415, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023387648, + "loss": 1.408, + "grad_norm": 0.14575619995594025, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023465984, + "loss": 1.4141, + "grad_norm": 0.15257515013217926, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023366144, + "loss": 1.4056, + "grad_norm": 0.09998749941587448, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023363072, + "loss": 1.3693, + "grad_norm": 0.10653584450483322, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023407616, + "loss": 1.4201, + "grad_norm": 0.17916665971279144, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023399936, + "loss": 1.3942, + "grad_norm": 0.1180114671587944, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023422976, + "loss": 1.3927, + "grad_norm": 0.11093945801258087, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02339072, + "loss": 1.4226, + "grad_norm": 0.28586798906326294, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023381504, + "loss": 1.4175, + "grad_norm": 0.12459742277860641, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023418368, + "loss": 1.386, + "grad_norm": 0.14990372955799103, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023458304, + "loss": 1.3916, + "grad_norm": 0.12644708156585693, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023387648, + "loss": 1.3742, + "grad_norm": 0.09974505752325058, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023289344, + "loss": 1.3928, + "grad_norm": 0.18779629468917847, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023370752, + "loss": 1.3967, + "grad_norm": 0.1381853222846985, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 2.092494848, + "gpu_mem": 5.0234752, + "loss": 1.4043, + "grad_norm": 0.08594711124897003, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02335232, + "loss": 1.4287, + "grad_norm": 0.1739995926618576, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023404544, + "loss": 1.4057, + "grad_norm": 0.08903756737709045, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023396864, + "loss": 1.4038, + "grad_norm": 0.13635936379432678, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023312384, + "loss": 1.3899, + "grad_norm": 0.08712787181138992, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023335424, + "loss": 1.4043, + "grad_norm": 0.20734219253063202, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023384576, + "loss": 1.3838, + "grad_norm": 0.07508254796266556, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023338496, + "loss": 1.4037, + "grad_norm": 0.06653927266597748, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023392256, + "loss": 1.3991, + "grad_norm": 0.12100593745708466, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023435264, + "loss": 1.4119, + "grad_norm": 0.18562443554401398, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02338304, + "loss": 1.3827, + "grad_norm": 0.13090719282627106, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 2.092494848, + "gpu_mem": 5.0233216, + "loss": 1.4126, + "grad_norm": 0.1175994798541069, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023422976, + "loss": 1.378, + "grad_norm": 0.1559111475944519, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023439872, + "loss": 1.3773, + "grad_norm": 0.2712160050868988, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023280128, + "loss": 1.4178, + "grad_norm": 0.1598067432641983, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023386112, + "loss": 1.4134, + "grad_norm": 0.1681811362504959, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023358464, + "loss": 1.3977, + "grad_norm": 0.1466103196144104, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023372288, + "loss": 1.3932, + "grad_norm": 0.22241823375225067, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023378432, + "loss": 1.45, + "grad_norm": 0.20554186403751373, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023407616, + "loss": 1.4473, + "grad_norm": 0.19892089068889618, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023349248, + "loss": 1.3713, + "grad_norm": 0.10219594836235046, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023370752, + "loss": 1.4106, + "grad_norm": 0.14916357398033142, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023396864, + "loss": 1.3721, + "grad_norm": 0.16665107011795044, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02334464, + "loss": 1.392, + "grad_norm": 0.13767126202583313, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023376896, + "loss": 1.433, + "grad_norm": 0.13740132749080658, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023369216, + "loss": 1.3848, + "grad_norm": 0.16357162594795227, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023309312, + "loss": 1.385, + "grad_norm": 0.06456801295280457, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023510528, + "loss": 1.4044, + "grad_norm": 0.08451258391141891, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023324672, + "loss": 1.4294, + "grad_norm": 0.21316909790039062, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023492096, + "loss": 1.4194, + "grad_norm": 0.138977512717247, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023366144, + "loss": 1.4317, + "grad_norm": 0.16922973096370697, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023361536, + "loss": 1.405, + "grad_norm": 0.12230396270751953, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023409152, + "loss": 1.3746, + "grad_norm": 0.12091266363859177, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02339072, + "loss": 1.3462, + "grad_norm": 0.15909069776535034, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023355392, + "loss": 1.3816, + "grad_norm": 0.11484885960817337, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023338496, + "loss": 1.4, + "grad_norm": 0.06656291335821152, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023389184, + "loss": 1.3874, + "grad_norm": 0.1247660219669342, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023349248, + "loss": 1.4038, + "grad_norm": 0.1562419831752777, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023438336, + "loss": 1.393, + "grad_norm": 0.09144721180200577, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023307776, + "loss": 1.3871, + "grad_norm": 0.12821786105632782, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023370752, + "loss": 1.3847, + "grad_norm": 0.06398934870958328, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023366144, + "loss": 1.3788, + "grad_norm": 0.09258691221475601, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023355392, + "loss": 1.374, + "grad_norm": 0.09886990487575531, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023524352, + "loss": 1.3951, + "grad_norm": 0.10488520562648773, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 2.092494848, + "gpu_mem": 5.0233216, + "loss": 1.3752, + "grad_norm": 0.1477169245481491, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023323136, + "loss": 1.3901, + "grad_norm": 0.12490345537662506, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023422976, + "loss": 1.3696, + "grad_norm": 0.1147606149315834, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02352128, + "loss": 1.3786, + "grad_norm": 0.09607420861721039, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 2.092494848, + "gpu_mem": 5.023350784, + "loss": 1.3949, + "grad_norm": 0.07278651744127274, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02336, + "loss": 1.4004, + "grad_norm": 0.06848017126321793, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 2.092494848, + "gpu_mem": 5.02342144, + "loss": 1.372, + "grad_norm": 0.05692872032523155, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 2.095443968, + "gpu_mem": 5.02331392, + "loss": 1.3944, + "grad_norm": 0.11486182361841202, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023409152, + "loss": 1.4019, + "grad_norm": 0.10114386677742004, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023542784, + "loss": 1.4199, + "grad_norm": 0.16552084684371948, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023446016, + "loss": 1.3951, + "grad_norm": 0.13523517549037933, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023492096, + "loss": 1.3913, + "grad_norm": 0.08044234663248062, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0234368, + "loss": 1.3995, + "grad_norm": 0.09758109599351883, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0234752, + "loss": 1.3711, + "grad_norm": 0.09616128355264664, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0233984, + "loss": 1.4184, + "grad_norm": 0.14178140461444855, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023433728, + "loss": 1.4069, + "grad_norm": 0.11227969080209732, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023340032, + "loss": 1.3932, + "grad_norm": 0.14919468760490417, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023364608, + "loss": 1.4026, + "grad_norm": 0.12101634591817856, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02333696, + "loss": 1.3863, + "grad_norm": 0.12134291231632233, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023343104, + "loss": 1.3982, + "grad_norm": 0.13776960968971252, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023333888, + "loss": 1.3847, + "grad_norm": 0.11652427166700363, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02326784, + "loss": 1.3854, + "grad_norm": 0.06785834580659866, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023430656, + "loss": 1.3848, + "grad_norm": 0.08471981436014175, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023343104, + "loss": 1.349, + "grad_norm": 0.13754825294017792, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023346176, + "loss": 1.3878, + "grad_norm": 0.06039239838719368, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023401472, + "loss": 1.3749, + "grad_norm": 0.10129052400588989, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023304704, + "loss": 1.4373, + "grad_norm": 0.16966505348682404, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023472128, + "loss": 1.3809, + "grad_norm": 0.09872502833604813, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023338496, + "loss": 1.3783, + "grad_norm": 0.12594138085842133, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023327744, + "loss": 1.3561, + "grad_norm": 0.1325225830078125, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023379968, + "loss": 1.3648, + "grad_norm": 0.12468275427818298, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023304704, + "loss": 1.415, + "grad_norm": 0.14548173546791077, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023366144, + "loss": 1.4095, + "grad_norm": 0.1475028693675995, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023364608, + "loss": 1.3878, + "grad_norm": 0.09868435561656952, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023309312, + "loss": 1.378, + "grad_norm": 0.08572490513324738, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02330624, + "loss": 1.3934, + "grad_norm": 0.12330161780118942, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023407616, + "loss": 1.355, + "grad_norm": 0.09540458023548126, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023412224, + "loss": 1.3788, + "grad_norm": 0.1199817955493927, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023510528, + "loss": 1.3649, + "grad_norm": 0.08093950152397156, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023353856, + "loss": 1.3898, + "grad_norm": 0.134857639670372, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023346176, + "loss": 1.3754, + "grad_norm": 0.07249001413583755, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023289344, + "loss": 1.3706, + "grad_norm": 0.07142002880573273, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0233216, + "loss": 1.3884, + "grad_norm": 0.0991361066699028, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023349248, + "loss": 1.369, + "grad_norm": 0.11516310274600983, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023399936, + "loss": 1.4031, + "grad_norm": 0.12614090740680695, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023312384, + "loss": 1.3962, + "grad_norm": 0.07589416950941086, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023332352, + "loss": 1.3796, + "grad_norm": 0.12739546597003937, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023409152, + "loss": 1.4084, + "grad_norm": 0.15345190465450287, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023310848, + "loss": 1.3992, + "grad_norm": 0.07678941637277603, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023412224, + "loss": 1.3993, + "grad_norm": 0.09184964001178741, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023369216, + "loss": 1.4114, + "grad_norm": 0.1655096858739853, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02336768, + "loss": 1.3905, + "grad_norm": 0.09337993711233139, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023350784, + "loss": 1.3944, + "grad_norm": 0.07489847391843796, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02346752, + "loss": 1.3827, + "grad_norm": 0.08136715739965439, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023363072, + "loss": 1.3764, + "grad_norm": 0.10279494524002075, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023442944, + "loss": 1.3793, + "grad_norm": 0.09409327805042267, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02335232, + "loss": 1.3808, + "grad_norm": 0.1479073464870453, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02331392, + "loss": 1.3744, + "grad_norm": 0.12301099300384521, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023316992, + "loss": 1.3754, + "grad_norm": 0.15243539214134216, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023384576, + "loss": 1.3713, + "grad_norm": 0.12365742027759552, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023392256, + "loss": 1.3892, + "grad_norm": 0.0711180791258812, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023366144, + "loss": 1.3679, + "grad_norm": 0.09409341216087341, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023346176, + "loss": 1.3824, + "grad_norm": 0.08553092926740646, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02338304, + "loss": 1.3808, + "grad_norm": 0.07596352696418762, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023326208, + "loss": 1.4026, + "grad_norm": 0.1006695106625557, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023396864, + "loss": 1.3407, + "grad_norm": 0.07519319653511047, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023293952, + "loss": 1.3654, + "grad_norm": 0.07208561897277832, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023303168, + "loss": 1.3999, + "grad_norm": 0.14012208580970764, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023379968, + "loss": 1.3774, + "grad_norm": 0.12509065866470337, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023387648, + "loss": 1.3941, + "grad_norm": 0.12293844670057297, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023393792, + "loss": 1.4012, + "grad_norm": 0.10916856676340103, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023312384, + "loss": 1.3947, + "grad_norm": 0.09966816753149033, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023422976, + "loss": 1.3837, + "grad_norm": 0.07999789714813232, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023410688, + "loss": 1.3961, + "grad_norm": 0.08375240117311478, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023393792, + "loss": 1.3906, + "grad_norm": 0.15725623071193695, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023332352, + "loss": 1.3962, + "grad_norm": 0.18562915921211243, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023364608, + "loss": 1.3522, + "grad_norm": 0.0878351628780365, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023461376, + "loss": 1.3867, + "grad_norm": 0.08273467421531677, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023433728, + "loss": 1.3595, + "grad_norm": 0.08700236678123474, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023338496, + "loss": 1.4216, + "grad_norm": 0.15821169316768646, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023370752, + "loss": 1.3503, + "grad_norm": 0.09808243066072464, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02336, + "loss": 1.415, + "grad_norm": 0.1363837718963623, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023297024, + "loss": 1.3569, + "grad_norm": 0.10272692888975143, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023418368, + "loss": 1.3936, + "grad_norm": 0.11269378662109375, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023340032, + "loss": 1.366, + "grad_norm": 0.08116472512483597, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02345216, + "loss": 1.3897, + "grad_norm": 0.0997994989156723, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023447552, + "loss": 1.3678, + "grad_norm": 0.0656476616859436, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 2.0734, + "grad_norm": 0.16140273213386536, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225298432, + "loss": 1.3658, + "grad_norm": 0.08280732482671738, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22529536, + "loss": 1.3689, + "grad_norm": 0.1051713079214096, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22518016, + "loss": 1.3666, + "grad_norm": 0.11070103198289871, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225230848, + "loss": 1.3829, + "grad_norm": 0.1398647576570511, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225224704, + "loss": 1.4109, + "grad_norm": 0.12396052479743958, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22527232, + "loss": 1.3697, + "grad_norm": 0.08383014053106308, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225204736, + "loss": 1.3538, + "grad_norm": 0.09628696739673615, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225250816, + "loss": 1.3823, + "grad_norm": 0.09038093686103821, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22526464, + "loss": 1.3618, + "grad_norm": 0.12518168985843658, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225220096, + "loss": 1.3823, + "grad_norm": 0.12609227001667023, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225212416, + "loss": 1.3597, + "grad_norm": 0.08165471255779266, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22533376, + "loss": 1.3712, + "grad_norm": 0.12241001427173615, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225230848, + "loss": 1.3444, + "grad_norm": 0.12741129100322723, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 1.3836, + "grad_norm": 0.11377827823162079, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225235456, + "loss": 1.3826, + "grad_norm": 0.09822376817464828, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225161728, + "loss": 1.3893, + "grad_norm": 0.12009350955486298, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22515712, + "loss": 1.3711, + "grad_norm": 0.15842504799365997, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225207808, + "loss": 1.3964, + "grad_norm": 0.24463821947574615, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225336832, + "loss": 1.3802, + "grad_norm": 0.1074153408408165, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225198592, + "loss": 1.4212, + "grad_norm": 0.16619448363780975, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225224704, + "loss": 1.3949, + "grad_norm": 0.13379280269145966, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225236992, + "loss": 1.4064, + "grad_norm": 0.1037093847990036, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225209344, + "loss": 1.4236, + "grad_norm": 0.23141014575958252, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225183232, + "loss": 1.4369, + "grad_norm": 0.13749733567237854, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22519552, + "loss": 1.3801, + "grad_norm": 0.1272643506526947, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225223168, + "loss": 1.3773, + "grad_norm": 0.16451361775398254, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 2.097606656, + "gpu_mem": 5.2252416, + "loss": 1.3924, + "grad_norm": 0.11038436740636826, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225292288, + "loss": 1.3357, + "grad_norm": 0.09217668324708939, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225243136, + "loss": 1.398, + "grad_norm": 0.07793892920017242, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225193984, + "loss": 1.3725, + "grad_norm": 0.09290432184934616, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225238528, + "loss": 1.4033, + "grad_norm": 0.12867408990859985, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225137152, + "loss": 1.3514, + "grad_norm": 0.06013266742229462, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225204736, + "loss": 1.3907, + "grad_norm": 0.08891057223081589, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22519552, + "loss": 1.4205, + "grad_norm": 0.1528865396976471, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225200128, + "loss": 1.3167, + "grad_norm": 0.08379894495010376, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225315328, + "loss": 1.3875, + "grad_norm": 0.1535923033952713, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225275392, + "loss": 1.441, + "grad_norm": 0.15936315059661865, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22523392, + "loss": 1.4172, + "grad_norm": 0.14903037250041962, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 1.3635, + "grad_norm": 0.10939159244298935, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22525696, + "loss": 1.3834, + "grad_norm": 0.07424819469451904, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225276928, + "loss": 1.3834, + "grad_norm": 0.060000572353601456, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225193984, + "loss": 1.3754, + "grad_norm": 0.1027088314294815, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225329152, + "loss": 1.3812, + "grad_norm": 0.0854240357875824, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225283072, + "loss": 1.3629, + "grad_norm": 0.07821852713823318, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225304576, + "loss": 1.3942, + "grad_norm": 0.09134451299905777, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225190912, + "loss": 1.342, + "grad_norm": 0.16135185956954956, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225178624, + "loss": 1.4488, + "grad_norm": 0.16970904171466827, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225183232, + "loss": 1.3998, + "grad_norm": 0.12316521257162094, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225220096, + "loss": 1.3909, + "grad_norm": 0.10256976634263992, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225181696, + "loss": 1.3759, + "grad_norm": 0.11245053261518478, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225227776, + "loss": 1.3965, + "grad_norm": 0.07924303412437439, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225236992, + "loss": 1.3974, + "grad_norm": 0.08717533200979233, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225275392, + "loss": 1.3586, + "grad_norm": 0.06859307736158371, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225193984, + "loss": 1.39, + "grad_norm": 0.11102957278490067, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 2.097606656, + "gpu_mem": 5.2252032, + "loss": 1.3652, + "grad_norm": 0.11725170910358429, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225269248, + "loss": 1.3704, + "grad_norm": 0.0597151480615139, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22519552, + "loss": 1.3567, + "grad_norm": 0.08320306986570358, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225189376, + "loss": 1.3894, + "grad_norm": 0.09628330171108246, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225255424, + "loss": 1.3832, + "grad_norm": 0.09768350422382355, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225213952, + "loss": 1.3786, + "grad_norm": 0.07934417575597763, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225327616, + "loss": 1.3596, + "grad_norm": 0.07175023853778839, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225204736, + "loss": 1.3785, + "grad_norm": 0.07853607088327408, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225230848, + "loss": 1.3903, + "grad_norm": 0.07328056544065475, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225236992, + "loss": 1.3997, + "grad_norm": 0.1354559361934662, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225304576, + "loss": 1.381, + "grad_norm": 0.07513947784900665, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225160192, + "loss": 1.3711, + "grad_norm": 0.07666267454624176, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225293824, + "loss": 1.3741, + "grad_norm": 0.04778103902935982, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225323008, + "loss": 1.3603, + "grad_norm": 0.07590889185667038, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225192448, + "loss": 1.3944, + "grad_norm": 0.0872729942202568, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225193984, + "loss": 1.3842, + "grad_norm": 0.08165434002876282, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22519552, + "loss": 1.3545, + "grad_norm": 0.05452108755707741, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225269248, + "loss": 1.4032, + "grad_norm": 0.11400062590837479, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225250816, + "loss": 1.3541, + "grad_norm": 0.06522851437330246, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225193984, + "loss": 1.4057, + "grad_norm": 0.09262599050998688, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 1.3766, + "grad_norm": 0.1050165593624115, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225299968, + "loss": 1.3831, + "grad_norm": 0.06024334952235222, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225252352, + "loss": 1.4013, + "grad_norm": 0.09611882269382477, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22537984, + "loss": 1.3884, + "grad_norm": 0.10370416939258575, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22526464, + "loss": 1.3767, + "grad_norm": 0.0938180536031723, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225213952, + "loss": 1.3761, + "grad_norm": 0.0955619290471077, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225201664, + "loss": 1.3384, + "grad_norm": 0.09996272623538971, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225193984, + "loss": 1.3895, + "grad_norm": 0.06775447726249695, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225316864, + "loss": 1.3685, + "grad_norm": 0.07409968227148056, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22525696, + "loss": 1.3817, + "grad_norm": 0.11970478296279907, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 2.097606656, + "gpu_mem": 5.2252032, + "loss": 1.3897, + "grad_norm": 0.13555559515953064, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225220096, + "loss": 1.4252, + "grad_norm": 0.153153195977211, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225359872, + "loss": 1.363, + "grad_norm": 0.09430140256881714, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225230848, + "loss": 1.3877, + "grad_norm": 0.07627585530281067, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22522624, + "loss": 1.3906, + "grad_norm": 0.10076899826526642, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225129472, + "loss": 1.376, + "grad_norm": 0.10536720603704453, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225204736, + "loss": 1.3564, + "grad_norm": 0.0883631482720375, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225224704, + "loss": 1.3918, + "grad_norm": 0.09157300740480423, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225190912, + "loss": 1.3718, + "grad_norm": 0.07194729149341583, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225152512, + "loss": 1.3771, + "grad_norm": 0.07772547006607056, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225278464, + "loss": 1.3817, + "grad_norm": 0.07354319095611572, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225155584, + "loss": 1.4081, + "grad_norm": 0.09436897188425064, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225198592, + "loss": 1.4123, + "grad_norm": 0.09413344413042068, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225181696, + "loss": 1.342, + "grad_norm": 0.12768588960170746, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225223168, + "loss": 1.3782, + "grad_norm": 0.0854087620973587, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225163264, + "loss": 1.3524, + "grad_norm": 0.06653552502393723, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225186304, + "loss": 1.347, + "grad_norm": 0.10484214872121811, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225175552, + "loss": 1.3545, + "grad_norm": 0.07785512506961823, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22518784, + "loss": 1.3768, + "grad_norm": 0.06456510722637177, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225190912, + "loss": 1.3509, + "grad_norm": 0.1549719274044037, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22517248, + "loss": 1.3762, + "grad_norm": 0.06324886530637741, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225178624, + "loss": 1.3637, + "grad_norm": 0.09968630969524384, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 2.097606656, + "gpu_mem": 5.2251264, + "loss": 1.4372, + "grad_norm": 0.25002405047416687, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225198592, + "loss": 1.3731, + "grad_norm": 0.09397585690021515, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225240064, + "loss": 1.3619, + "grad_norm": 0.09155919402837753, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22518784, + "loss": 1.3458, + "grad_norm": 0.07885987311601639, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225198592, + "loss": 1.397, + "grad_norm": 0.09076039493083954, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225181696, + "loss": 1.3893, + "grad_norm": 0.11672648787498474, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225236992, + "loss": 1.3713, + "grad_norm": 0.07299899309873581, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 2.097606656, + "gpu_mem": 5.2252032, + "loss": 1.3657, + "grad_norm": 0.06599471718072891, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225212416, + "loss": 1.3937, + "grad_norm": 0.07548397779464722, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225213952, + "loss": 1.3843, + "grad_norm": 0.14317268133163452, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22544128, + "loss": 1.3758, + "grad_norm": 0.07235162705183029, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22523392, + "loss": 1.3618, + "grad_norm": 0.10120508074760437, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225206272, + "loss": 1.3784, + "grad_norm": 0.11348243802785873, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225170944, + "loss": 1.3768, + "grad_norm": 0.15892241895198822, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225261568, + "loss": 1.3633, + "grad_norm": 0.10408803820610046, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225186304, + "loss": 1.3703, + "grad_norm": 0.1156342476606369, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22523392, + "loss": 1.3663, + "grad_norm": 0.08790463954210281, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225175552, + "loss": 1.3768, + "grad_norm": 0.11334401369094849, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225396736, + "loss": 1.3877, + "grad_norm": 0.10038699954748154, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225278464, + "loss": 1.3582, + "grad_norm": 0.0993136465549469, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225170944, + "loss": 1.3425, + "grad_norm": 0.17016522586345673, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22521088, + "loss": 1.3867, + "grad_norm": 0.10087893158197403, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225258496, + "loss": 1.366, + "grad_norm": 0.1097145825624466, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225224704, + "loss": 1.4002, + "grad_norm": 0.13216274976730347, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22528, + "loss": 1.3901, + "grad_norm": 0.11289918422698975, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22517248, + "loss": 1.3641, + "grad_norm": 0.08961150050163269, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22525696, + "loss": 1.3433, + "grad_norm": 0.11354149132966995, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225246208, + "loss": 1.364, + "grad_norm": 0.13793021440505981, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225201664, + "loss": 1.3754, + "grad_norm": 0.07995404303073883, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225290752, + "loss": 1.4013, + "grad_norm": 0.09345988929271698, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22521856, + "loss": 1.3631, + "grad_norm": 0.09081145375967026, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225227776, + "loss": 1.3836, + "grad_norm": 0.08292467892169952, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22528, + "loss": 1.342, + "grad_norm": 0.08972112834453583, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22528, + "loss": 1.3517, + "grad_norm": 0.21372486650943756, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225316864, + "loss": 1.3815, + "grad_norm": 0.1405383050441742, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22523392, + "loss": 1.3905, + "grad_norm": 0.11434153467416763, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22530304, + "loss": 1.3732, + "grad_norm": 0.0888475552201271, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225244672, + "loss": 1.3693, + "grad_norm": 0.10161008685827255, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225250816, + "loss": 1.3554, + "grad_norm": 0.1444694697856903, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225161728, + "loss": 1.3813, + "grad_norm": 0.12481843680143356, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22528, + "loss": 1.3865, + "grad_norm": 0.0915384441614151, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 1.3553, + "grad_norm": 0.09329578280448914, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225178624, + "loss": 1.375, + "grad_norm": 0.1313147246837616, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225227776, + "loss": 1.3583, + "grad_norm": 0.14309073984622955, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225353728, + "loss": 1.3962, + "grad_norm": 0.1070340946316719, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225152512, + "loss": 1.3285, + "grad_norm": 0.08102849125862122, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225217024, + "loss": 1.3583, + "grad_norm": 0.08409847319126129, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 1.3452, + "grad_norm": 0.09179184585809708, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225223168, + "loss": 1.3526, + "grad_norm": 0.08309400081634521, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225220096, + "loss": 1.3494, + "grad_norm": 0.14211881160736084, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22519552, + "loss": 1.3837, + "grad_norm": 0.11210797727108002, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225178624, + "loss": 1.3557, + "grad_norm": 0.07443038374185562, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22518016, + "loss": 1.3747, + "grad_norm": 0.0735790953040123, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 1.3404, + "grad_norm": 0.13101893663406372, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225298432, + "loss": 1.3882, + "grad_norm": 0.21229051053524017, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225252352, + "loss": 1.3537, + "grad_norm": 0.09808764606714249, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22530304, + "loss": 1.3848, + "grad_norm": 0.08892170339822769, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225273856, + "loss": 1.4288, + "grad_norm": 0.14257118105888367, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225207808, + "loss": 1.3374, + "grad_norm": 0.09616126120090485, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 2.097606656, + "gpu_mem": 5.2251648, + "loss": 1.3383, + "grad_norm": 0.09475188702344894, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225192448, + "loss": 1.3286, + "grad_norm": 0.18226824700832367, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225175552, + "loss": 1.3882, + "grad_norm": 0.10736498981714249, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22521856, + "loss": 1.3582, + "grad_norm": 0.12344431132078171, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22527232, + "loss": 1.3593, + "grad_norm": 0.1394435614347458, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225169408, + "loss": 1.3748, + "grad_norm": 0.1193355917930603, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225229312, + "loss": 1.3557, + "grad_norm": 0.15916530787944794, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225152512, + "loss": 1.3711, + "grad_norm": 0.21765422821044922, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225312256, + "loss": 1.3764, + "grad_norm": 0.13213393092155457, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225166336, + "loss": 1.3653, + "grad_norm": 0.08698606491088867, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225189376, + "loss": 1.3323, + "grad_norm": 0.08427226543426514, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225212416, + "loss": 1.4363, + "grad_norm": 0.25485554337501526, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225436672, + "loss": 1.4589, + "grad_norm": 0.21366065740585327, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225193984, + "loss": 1.3818, + "grad_norm": 0.10645754635334015, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225316864, + "loss": 1.3825, + "grad_norm": 0.12373249232769012, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225232384, + "loss": 1.3845, + "grad_norm": 0.11327085644006729, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225321472, + "loss": 1.3517, + "grad_norm": 0.16957500576972961, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225220096, + "loss": 1.3421, + "grad_norm": 0.11874180287122726, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225227776, + "loss": 1.3681, + "grad_norm": 0.09566855430603027, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225150976, + "loss": 1.3379, + "grad_norm": 0.14228934049606323, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225230848, + "loss": 1.4178, + "grad_norm": 0.1772908717393875, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225201664, + "loss": 1.3583, + "grad_norm": 0.11914203315973282, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225189376, + "loss": 1.3387, + "grad_norm": 0.09314459562301636, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225332224, + "loss": 1.3339, + "grad_norm": 0.15286897122859955, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225215488, + "loss": 1.3829, + "grad_norm": 0.11533936858177185, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22522624, + "loss": 1.3833, + "grad_norm": 0.10550590604543686, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225381376, + "loss": 1.4085, + "grad_norm": 0.14180640876293182, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225190912, + "loss": 1.3948, + "grad_norm": 0.12902002036571503, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 2.097606656, + "gpu_mem": 5.225212416, + "loss": 1.3746, + "grad_norm": 0.1296055167913437, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 2.097606656, + "gpu_mem": 5.22524928, + "loss": 1.3948, + "grad_norm": 0.10897767543792725, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 2.097606656, + "gpu_mem": 5.224836096, + "loss": 2.0283, + "grad_norm": 0.15235230326652527, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023320064, + "loss": 1.3737, + "grad_norm": 0.1621650904417038, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02329856, + "loss": 1.3848, + "grad_norm": 0.10830694437026978, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02331392, + "loss": 1.3742, + "grad_norm": 0.18477565050125122, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023461376, + "loss": 1.3515, + "grad_norm": 0.09645005315542221, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023407616, + "loss": 1.3977, + "grad_norm": 0.11919897049665451, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023424512, + "loss": 1.4004, + "grad_norm": 0.11987853795289993, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023376896, + "loss": 1.3949, + "grad_norm": 0.13423588871955872, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023401472, + "loss": 1.3864, + "grad_norm": 0.11585619300603867, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023449088, + "loss": 1.3656, + "grad_norm": 0.1694011688232422, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023358464, + "loss": 1.3767, + "grad_norm": 0.16418787837028503, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023330816, + "loss": 1.3393, + "grad_norm": 0.153577521443367, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023419904, + "loss": 1.3386, + "grad_norm": 0.09233392030000687, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023310848, + "loss": 1.3644, + "grad_norm": 0.16417177021503448, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023363072, + "loss": 1.3523, + "grad_norm": 0.11673954129219055, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023316992, + "loss": 1.3595, + "grad_norm": 0.116581991314888, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023519744, + "loss": 1.3567, + "grad_norm": 0.12573616206645966, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023518208, + "loss": 1.3813, + "grad_norm": 0.17161321640014648, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023441408, + "loss": 1.3605, + "grad_norm": 0.12039340287446976, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0233984, + "loss": 1.3399, + "grad_norm": 0.10583378374576569, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02337536, + "loss": 1.3249, + "grad_norm": 0.08971253782510757, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02332928, + "loss": 1.3435, + "grad_norm": 0.11068787425756454, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023446016, + "loss": 1.3501, + "grad_norm": 0.15414883196353912, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023333888, + "loss": 1.3615, + "grad_norm": 0.08943329006433487, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023323136, + "loss": 1.3689, + "grad_norm": 0.10885965079069138, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023327744, + "loss": 1.3554, + "grad_norm": 0.07330764085054398, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023442944, + "loss": 1.4095, + "grad_norm": 0.16408753395080566, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023416832, + "loss": 1.377, + "grad_norm": 0.12450336664915085, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023318528, + "loss": 1.3375, + "grad_norm": 0.17032939195632935, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023353856, + "loss": 1.3414, + "grad_norm": 0.12528181076049805, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023320064, + "loss": 1.3503, + "grad_norm": 0.08990214020013809, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023453696, + "loss": 1.3395, + "grad_norm": 0.08977000415325165, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023427584, + "loss": 1.3624, + "grad_norm": 0.11898981779813766, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023472128, + "loss": 1.346, + "grad_norm": 0.15996168553829193, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023363072, + "loss": 1.3614, + "grad_norm": 0.17816510796546936, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023379968, + "loss": 1.3689, + "grad_norm": 0.20531681180000305, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023455232, + "loss": 1.3663, + "grad_norm": 0.11304175108671188, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023303168, + "loss": 1.3613, + "grad_norm": 0.10711293667554855, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02342144, + "loss": 1.3609, + "grad_norm": 0.20871219038963318, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023410688, + "loss": 1.3359, + "grad_norm": 0.1800965517759323, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023376896, + "loss": 1.3455, + "grad_norm": 0.10922704637050629, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023355392, + "loss": 1.3621, + "grad_norm": 0.12730520963668823, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023350784, + "loss": 1.3725, + "grad_norm": 0.16326957941055298, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023361536, + "loss": 1.36, + "grad_norm": 0.12224474549293518, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023415296, + "loss": 1.395, + "grad_norm": 0.1534615457057953, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023332352, + "loss": 1.3812, + "grad_norm": 0.13234108686447144, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02336, + "loss": 1.3863, + "grad_norm": 0.17735913395881653, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023358464, + "loss": 1.3613, + "grad_norm": 0.13855727016925812, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023401472, + "loss": 1.3606, + "grad_norm": 0.1355808526277542, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023399936, + "loss": 1.359, + "grad_norm": 0.18253402411937714, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023450624, + "loss": 1.3536, + "grad_norm": 0.15602940320968628, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023456768, + "loss": 1.3439, + "grad_norm": 0.09778697043657303, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023395328, + "loss": 1.3671, + "grad_norm": 0.11863283067941666, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023384576, + "loss": 1.397, + "grad_norm": 0.18060003221035004, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023409152, + "loss": 1.3712, + "grad_norm": 0.12113738805055618, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023426048, + "loss": 1.3488, + "grad_norm": 0.15285247564315796, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023297024, + "loss": 1.3631, + "grad_norm": 0.12101858109235764, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023472128, + "loss": 1.3361, + "grad_norm": 0.14207875728607178, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02342912, + "loss": 1.3412, + "grad_norm": 0.10436668246984482, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023338496, + "loss": 1.3733, + "grad_norm": 0.17112308740615845, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023286272, + "loss": 1.3408, + "grad_norm": 0.10321767628192902, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023301632, + "loss": 1.3896, + "grad_norm": 0.12708689272403717, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023403008, + "loss": 1.3375, + "grad_norm": 0.09785709530115128, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023412224, + "loss": 1.3727, + "grad_norm": 0.19311727583408356, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023441408, + "loss": 1.3496, + "grad_norm": 0.17121818661689758, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023502848, + "loss": 1.3335, + "grad_norm": 0.12156299501657486, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023356928, + "loss": 1.355, + "grad_norm": 0.12453088164329529, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02344448, + "loss": 1.3313, + "grad_norm": 0.2172391712665558, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.3603, + "grad_norm": 0.15644493699073792, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023418368, + "loss": 1.3307, + "grad_norm": 0.10348180681467056, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023453696, + "loss": 1.3695, + "grad_norm": 0.16655202209949493, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023392256, + "loss": 1.2952, + "grad_norm": 0.13182023167610168, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023349248, + "loss": 1.3313, + "grad_norm": 0.1159553974866867, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023350784, + "loss": 1.4007, + "grad_norm": 0.12268602102994919, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023387648, + "loss": 1.3169, + "grad_norm": 0.11903684586286545, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02336, + "loss": 1.321, + "grad_norm": 0.10778895765542984, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023416832, + "loss": 1.3631, + "grad_norm": 0.1400643140077591, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023446016, + "loss": 1.3306, + "grad_norm": 0.15912677347660065, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023346176, + "loss": 1.3759, + "grad_norm": 0.2191082388162613, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023369216, + "loss": 1.3491, + "grad_norm": 0.19941779971122742, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02334464, + "loss": 1.3683, + "grad_norm": 0.18891407549381256, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0234368, + "loss": 1.4044, + "grad_norm": 0.15468700230121613, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02330624, + "loss": 1.3412, + "grad_norm": 0.14858222007751465, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023442944, + "loss": 1.3712, + "grad_norm": 0.11256857961416245, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023300096, + "loss": 1.3629, + "grad_norm": 0.2203657180070877, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023323136, + "loss": 1.3421, + "grad_norm": 0.09855563193559647, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023370752, + "loss": 1.3767, + "grad_norm": 0.17440812289714813, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02341376, + "loss": 1.3559, + "grad_norm": 0.09359735995531082, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023341568, + "loss": 1.3477, + "grad_norm": 0.17375795543193817, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023439872, + "loss": 1.3717, + "grad_norm": 0.18833616375923157, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023335424, + "loss": 1.3486, + "grad_norm": 0.13220572471618652, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02349056, + "loss": 1.3684, + "grad_norm": 0.13149318099021912, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023323136, + "loss": 1.34, + "grad_norm": 0.12346682697534561, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023366144, + "loss": 1.3908, + "grad_norm": 0.16203059256076813, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023387648, + "loss": 1.3586, + "grad_norm": 0.1295696347951889, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023310848, + "loss": 1.3526, + "grad_norm": 0.12611760199069977, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023447552, + "loss": 1.3558, + "grad_norm": 0.19936195015907288, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023335424, + "loss": 1.3191, + "grad_norm": 0.10953337699174881, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023332352, + "loss": 1.4092, + "grad_norm": 0.12222813069820404, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02330624, + "loss": 1.2919, + "grad_norm": 0.10766459256410599, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023341568, + "loss": 1.3023, + "grad_norm": 0.13826335966587067, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023356928, + "loss": 1.3364, + "grad_norm": 0.14942249655723572, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023350784, + "loss": 1.3393, + "grad_norm": 0.13177841901779175, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023363072, + "loss": 1.3958, + "grad_norm": 0.16141349077224731, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023395328, + "loss": 1.3265, + "grad_norm": 0.12936410307884216, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023396864, + "loss": 1.3415, + "grad_norm": 0.11042087525129318, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023369216, + "loss": 1.3613, + "grad_norm": 0.12383691221475601, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023418368, + "loss": 1.3139, + "grad_norm": 0.1895645707845688, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.3603, + "grad_norm": 0.13090960681438446, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02340608, + "loss": 1.3311, + "grad_norm": 0.19973506033420563, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023396864, + "loss": 1.3026, + "grad_norm": 0.11740544438362122, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023330816, + "loss": 1.3349, + "grad_norm": 0.1911497712135315, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02335232, + "loss": 1.3448, + "grad_norm": 0.12041319161653519, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023407616, + "loss": 1.372, + "grad_norm": 0.2885526716709137, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023324672, + "loss": 1.3512, + "grad_norm": 0.10914917290210724, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023389184, + "loss": 1.3257, + "grad_norm": 0.13059920072555542, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.3813, + "grad_norm": 0.1647031009197235, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023379968, + "loss": 1.3801, + "grad_norm": 0.19006235897541046, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023353856, + "loss": 1.3533, + "grad_norm": 0.1610890030860901, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.3369, + "grad_norm": 0.13746750354766846, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023358464, + "loss": 1.3836, + "grad_norm": 0.1614898145198822, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023263232, + "loss": 1.349, + "grad_norm": 0.14449022710323334, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023332352, + "loss": 1.3211, + "grad_norm": 0.18423518538475037, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023379968, + "loss": 1.363, + "grad_norm": 0.1516801416873932, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023324672, + "loss": 1.3557, + "grad_norm": 0.22318509221076965, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02338304, + "loss": 1.3664, + "grad_norm": 0.1444937288761139, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023389184, + "loss": 1.3223, + "grad_norm": 0.12237636744976044, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023525888, + "loss": 1.322, + "grad_norm": 0.12398252636194229, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023335424, + "loss": 1.3521, + "grad_norm": 0.1520349681377411, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023399936, + "loss": 1.3032, + "grad_norm": 0.13556666672229767, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023315456, + "loss": 1.3451, + "grad_norm": 0.20363746583461761, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023332352, + "loss": 1.3398, + "grad_norm": 0.13346220552921295, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023332352, + "loss": 1.3112, + "grad_norm": 0.13579463958740234, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023458304, + "loss": 1.3474, + "grad_norm": 0.11342372745275497, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023338496, + "loss": 1.3264, + "grad_norm": 0.15050476789474487, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023496704, + "loss": 1.3794, + "grad_norm": 0.14252106845378876, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0233216, + "loss": 1.3249, + "grad_norm": 0.10365717858076096, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023340032, + "loss": 1.385, + "grad_norm": 0.12168505787849426, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023516672, + "loss": 1.2953, + "grad_norm": 0.12850001454353333, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02339072, + "loss": 1.3408, + "grad_norm": 0.11684691160917282, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023387648, + "loss": 1.3224, + "grad_norm": 0.16691863536834717, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023396864, + "loss": 1.379, + "grad_norm": 0.12496516108512878, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023272448, + "loss": 1.3358, + "grad_norm": 0.12254011631011963, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023555072, + "loss": 1.351, + "grad_norm": 0.16197244822978973, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023318528, + "loss": 1.3286, + "grad_norm": 0.10929515957832336, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02341376, + "loss": 1.3603, + "grad_norm": 0.10004337131977081, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023379968, + "loss": 1.3566, + "grad_norm": 0.22687365114688873, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.3687, + "grad_norm": 0.17481710016727448, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023366144, + "loss": 1.3659, + "grad_norm": 0.13331645727157593, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023416832, + "loss": 1.4196, + "grad_norm": 0.16931912302970886, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023312384, + "loss": 1.3751, + "grad_norm": 0.1303732693195343, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023341568, + "loss": 1.3484, + "grad_norm": 0.14195778965950012, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023284736, + "loss": 1.4012, + "grad_norm": 0.12940308451652527, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02336, + "loss": 1.3174, + "grad_norm": 0.1266697645187378, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02335232, + "loss": 1.376, + "grad_norm": 0.15501487255096436, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02338304, + "loss": 1.3613, + "grad_norm": 0.12661653757095337, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023312384, + "loss": 1.3672, + "grad_norm": 0.12456722557544708, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023338496, + "loss": 1.3357, + "grad_norm": 0.16156189143657684, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023433728, + "loss": 1.3451, + "grad_norm": 0.15168391168117523, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02332928, + "loss": 1.3065, + "grad_norm": 0.125733882188797, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023338496, + "loss": 1.4101, + "grad_norm": 0.13352614641189575, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023366144, + "loss": 1.3671, + "grad_norm": 0.147377148270607, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023379968, + "loss": 1.3589, + "grad_norm": 0.17584380507469177, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023373824, + "loss": 1.357, + "grad_norm": 0.12587469816207886, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02333696, + "loss": 1.3303, + "grad_norm": 0.14053376019001007, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023395328, + "loss": 1.3643, + "grad_norm": 0.18607409298419952, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023304704, + "loss": 1.3781, + "grad_norm": 0.16899679601192474, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023389184, + "loss": 1.3579, + "grad_norm": 0.13462162017822266, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023427584, + "loss": 1.3162, + "grad_norm": 0.19316910207271576, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023472128, + "loss": 1.4179, + "grad_norm": 0.12925928831100464, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023433728, + "loss": 1.3056, + "grad_norm": 0.1469661444425583, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023370752, + "loss": 1.3552, + "grad_norm": 0.1676928699016571, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023424512, + "loss": 1.3624, + "grad_norm": 0.1964017003774643, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023363072, + "loss": 1.3835, + "grad_norm": 0.2577018737792969, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023349248, + "loss": 1.3281, + "grad_norm": 0.12528155744075775, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023561216, + "loss": 1.3204, + "grad_norm": 0.12024806439876556, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023426048, + "loss": 1.3249, + "grad_norm": 0.24812181293964386, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023399936, + "loss": 1.3618, + "grad_norm": 0.14166007936000824, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02340608, + "loss": 1.3269, + "grad_norm": 0.14851367473602295, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023439872, + "loss": 1.3657, + "grad_norm": 0.15915240347385406, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.403, + "grad_norm": 0.1489430069923401, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023432192, + "loss": 1.3816, + "grad_norm": 0.16597650945186615, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023320064, + "loss": 1.3435, + "grad_norm": 0.10187417268753052, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02338304, + "loss": 1.3819, + "grad_norm": 0.14525948464870453, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.3419, + "grad_norm": 0.14910657703876495, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023347712, + "loss": 1.341, + "grad_norm": 0.11149083822965622, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023432192, + "loss": 1.3658, + "grad_norm": 0.16095764935016632, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023378432, + "loss": 1.3883, + "grad_norm": 0.14923934638500214, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 2.097606656, + "gpu_mem": 5.0233984, + "loss": 1.3587, + "grad_norm": 0.1574961394071579, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023326208, + "loss": 1.3403, + "grad_norm": 0.23870916664600372, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02341376, + "loss": 1.3812, + "grad_norm": 0.14083535969257355, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023381504, + "loss": 1.315, + "grad_norm": 0.15621314942836761, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 2.097606656, + "gpu_mem": 5.02336, + "loss": 1.309, + "grad_norm": 0.10324036329984665, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023303168, + "loss": 1.3658, + "grad_norm": 0.2391793578863144, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023369216, + "loss": 1.4192, + "grad_norm": 0.14164426922798157, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023410688, + "loss": 1.3616, + "grad_norm": 0.10059570521116257, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 2.097606656, + "gpu_mem": 5.023410688, + "train_runtime": 16692.8173, + "train_samples_per_second": 2.259, + "train_steps_per_second": 0.035, + "total_flos": 9.252417491653018e+16, + "train_loss": 1.441596847002198 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5723daa9f5f7b854bf548bbee9a6d37e12198a3a --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 16, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 8, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..129cfa025fbeeea999dc3a5fad1025386a2ac263 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.2739103491014253 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..9f7ab51b9b7a6007e512bc8fc9987d58d8d38e02 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12615680 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-logiqa-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2", + "seed": 42, + "timestamp": "2025-09-13T09:32:08.881170" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..316efb254541e4c5050c9590efd8024f8715ba33 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-logiqa-r8-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.930477568, + "gpu_mem": 4.468264448, + "loss": 3.8396, + "grad_norm": 3.549515962600708, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569209344, + "loss": 3.9728, + "grad_norm": 3.5172746181488037, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569286144, + "loss": 3.8467, + "grad_norm": 3.497091770172119, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569184768, + "loss": 3.808, + "grad_norm": 3.757815361022949, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569200128, + "loss": 3.9015, + "grad_norm": 3.5803463459014893, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569192448, + "loss": 3.8323, + "grad_norm": 3.3428876399993896, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569263104, + "loss": 3.8332, + "grad_norm": 3.485027313232422, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569247744, + "loss": 3.6785, + "grad_norm": 3.39766263961792, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569246208, + "loss": 3.6182, + "grad_norm": 3.279181480407715, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.935785984, + "gpu_mem": 4.56925696, + "loss": 3.8884, + "grad_norm": 3.2287352085113525, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569160192, + "loss": 3.5727, + "grad_norm": 3.1412644386291504, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.935785984, + "gpu_mem": 4.56921088, + "loss": 3.3879, + "grad_norm": 3.0449695587158203, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.935785984, + "gpu_mem": 4.56930304, + "loss": 3.3778, + "grad_norm": 3.026195526123047, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569215488, + "loss": 3.3758, + "grad_norm": 2.9620587825775146, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569353728, + "loss": 3.2196, + "grad_norm": 2.9325826168060303, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569213952, + "loss": 2.985, + "grad_norm": 2.479107141494751, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569244672, + "loss": 2.8292, + "grad_norm": 2.5109918117523193, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569207808, + "loss": 2.9548, + "grad_norm": 2.4550552368164062, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569115648, + "loss": 2.8191, + "grad_norm": 2.2314083576202393, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569154048, + "loss": 2.7148, + "grad_norm": 2.098381996154785, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.935785984, + "gpu_mem": 4.56928768, + "loss": 2.3199, + "grad_norm": 1.8392590284347534, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569186304, + "loss": 2.3935, + "grad_norm": 1.6726374626159668, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569213952, + "loss": 2.4778, + "grad_norm": 1.6418979167938232, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569207808, + "loss": 2.1971, + "grad_norm": 1.3529406785964966, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569213952, + "loss": 2.0924, + "grad_norm": 1.1247645616531372, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569266176, + "loss": 1.8261, + "grad_norm": 1.0008445978164673, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569207808, + "loss": 1.9698, + "grad_norm": 0.9867499470710754, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569154048, + "loss": 1.8058, + "grad_norm": 0.8025996685028076, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569246208, + "loss": 1.7408, + "grad_norm": 0.6417842507362366, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.935785984, + "gpu_mem": 4.5692416, + "loss": 1.645, + "grad_norm": 0.6631098389625549, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.935785984, + "gpu_mem": 4.569220096, + "loss": 1.6967, + "grad_norm": 0.6490795612335205, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569224704, + "loss": 1.6527, + "grad_norm": 0.4467485845088959, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569260032, + "loss": 1.4761, + "grad_norm": 0.39100348949432373, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569184768, + "loss": 1.6057, + "grad_norm": 0.2476552575826645, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569232384, + "loss": 1.5347, + "grad_norm": 0.25612369179725647, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569247744, + "loss": 1.5521, + "grad_norm": 0.2836960554122925, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569313792, + "loss": 1.443, + "grad_norm": 0.2236870974302292, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569197056, + "loss": 1.4704, + "grad_norm": 0.19536416232585907, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569313792, + "loss": 1.5229, + "grad_norm": 0.3132101595401764, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569235456, + "loss": 1.4489, + "grad_norm": 0.19646091759204865, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56913408, + "loss": 1.4597, + "grad_norm": 0.15439078211784363, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569206272, + "loss": 1.5278, + "grad_norm": 0.32348328828811646, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569169408, + "loss": 1.5113, + "grad_norm": 0.34586232900619507, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569204736, + "loss": 1.4489, + "grad_norm": 0.18899372220039368, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569258496, + "loss": 1.4338, + "grad_norm": 0.17869776487350464, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569306112, + "loss": 1.4351, + "grad_norm": 0.18683211505413055, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56913408, + "loss": 1.4025, + "grad_norm": 0.13387924432754517, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56918784, + "loss": 1.4227, + "grad_norm": 0.11155431717634201, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569177088, + "loss": 1.4045, + "grad_norm": 0.14956432580947876, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569186304, + "loss": 1.4344, + "grad_norm": 0.18801793456077576, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569281536, + "loss": 1.4229, + "grad_norm": 0.20351341366767883, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569223168, + "loss": 1.4127, + "grad_norm": 0.12706004083156586, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569301504, + "loss": 1.417, + "grad_norm": 0.12689682841300964, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569201664, + "loss": 1.4232, + "grad_norm": 0.13460659980773926, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569198592, + "loss": 1.4329, + "grad_norm": 0.2573798596858978, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569243136, + "loss": 1.4203, + "grad_norm": 0.16048993170261383, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569235456, + "loss": 1.4379, + "grad_norm": 0.20544494688510895, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569258496, + "loss": 1.4087, + "grad_norm": 0.1448390781879425, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56922624, + "loss": 1.3853, + "grad_norm": 0.18294715881347656, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569217024, + "loss": 1.441, + "grad_norm": 0.13632921874523163, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569253888, + "loss": 1.3919, + "grad_norm": 0.12274248898029327, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569293824, + "loss": 1.3998, + "grad_norm": 0.15397880971431732, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569223168, + "loss": 1.3871, + "grad_norm": 0.12094773352146149, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569124864, + "loss": 1.446, + "grad_norm": 0.2699417471885681, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569206272, + "loss": 1.4055, + "grad_norm": 0.20444999635219574, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56931072, + "loss": 1.43, + "grad_norm": 0.17445404827594757, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56918784, + "loss": 1.417, + "grad_norm": 0.16850414872169495, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569240064, + "loss": 1.4161, + "grad_norm": 0.11213970929384232, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569232384, + "loss": 1.3993, + "grad_norm": 0.14490589499473572, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569147904, + "loss": 1.3998, + "grad_norm": 0.12091319262981415, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569170944, + "loss": 1.3663, + "grad_norm": 0.1756073236465454, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569220096, + "loss": 1.3889, + "grad_norm": 0.09686543792486191, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569174016, + "loss": 1.4051, + "grad_norm": 0.07542643696069717, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569227776, + "loss": 1.3998, + "grad_norm": 0.10449641197919846, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569270784, + "loss": 1.442, + "grad_norm": 0.21701234579086304, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56921856, + "loss": 1.4113, + "grad_norm": 0.170782670378685, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56915712, + "loss": 1.4352, + "grad_norm": 0.18078745901584625, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569258496, + "loss": 1.4165, + "grad_norm": 0.212010458111763, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569275392, + "loss": 1.4428, + "grad_norm": 0.30649855732917786, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569115648, + "loss": 1.4055, + "grad_norm": 0.11497607082128525, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569221632, + "loss": 1.3902, + "grad_norm": 0.13332179188728333, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569193984, + "loss": 1.3881, + "grad_norm": 0.12817880511283875, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569207808, + "loss": 1.3886, + "grad_norm": 0.1981366127729416, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569213952, + "loss": 1.4239, + "grad_norm": 0.18421640992164612, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569243136, + "loss": 1.4315, + "grad_norm": 0.1648665815591812, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569184768, + "loss": 1.3841, + "grad_norm": 0.11869049072265625, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569206272, + "loss": 1.4143, + "grad_norm": 0.18488410115242004, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569232384, + "loss": 1.357, + "grad_norm": 0.13734875619411469, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56918016, + "loss": 1.369, + "grad_norm": 0.11882341653108597, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569212416, + "loss": 1.4168, + "grad_norm": 0.14233236014842987, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569204736, + "loss": 1.3552, + "grad_norm": 0.1299295276403427, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569144832, + "loss": 1.3958, + "grad_norm": 0.12095583230257034, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569346048, + "loss": 1.4076, + "grad_norm": 0.09772393107414246, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569160192, + "loss": 1.4145, + "grad_norm": 0.20665358006954193, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569327616, + "loss": 1.4027, + "grad_norm": 0.1196855753660202, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569201664, + "loss": 1.4086, + "grad_norm": 0.17218641936779022, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569197056, + "loss": 1.402, + "grad_norm": 0.13539455831050873, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569244672, + "loss": 1.3623, + "grad_norm": 0.12946245074272156, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56922624, + "loss": 1.3559, + "grad_norm": 0.19536428153514862, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569190912, + "loss": 1.3715, + "grad_norm": 0.11279573291540146, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569174016, + "loss": 1.4044, + "grad_norm": 0.07794487476348877, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569224704, + "loss": 1.3788, + "grad_norm": 0.13990958034992218, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569184768, + "loss": 1.3843, + "grad_norm": 0.1598130762577057, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569273856, + "loss": 1.3827, + "grad_norm": 0.09490810334682465, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569143296, + "loss": 1.3794, + "grad_norm": 0.12982894480228424, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569206272, + "loss": 1.3857, + "grad_norm": 0.08088549971580505, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569201664, + "loss": 1.3702, + "grad_norm": 0.09849405288696289, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569190912, + "loss": 1.3678, + "grad_norm": 0.10334116965532303, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569359872, + "loss": 1.3952, + "grad_norm": 0.10706798732280731, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56915712, + "loss": 1.3762, + "grad_norm": 0.15835213661193848, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569158656, + "loss": 1.388, + "grad_norm": 0.1385117918252945, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569258496, + "loss": 1.3767, + "grad_norm": 0.12949194014072418, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.935982592, + "gpu_mem": 4.5693568, + "loss": 1.3819, + "grad_norm": 0.11827753484249115, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569186304, + "loss": 1.3895, + "grad_norm": 0.07724884897470474, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56919552, + "loss": 1.3957, + "grad_norm": 0.07897931337356567, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56925696, + "loss": 1.3795, + "grad_norm": 0.08443287014961243, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56914944, + "loss": 1.3906, + "grad_norm": 0.11091914772987366, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569244672, + "loss": 1.4031, + "grad_norm": 0.1031036525964737, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569378304, + "loss": 1.4194, + "grad_norm": 0.18757420778274536, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569281536, + "loss": 1.3883, + "grad_norm": 0.163682758808136, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569327616, + "loss": 1.3929, + "grad_norm": 0.09515506029129028, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56927232, + "loss": 1.3928, + "grad_norm": 0.12332849949598312, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56931072, + "loss": 1.3828, + "grad_norm": 0.14076073467731476, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56923392, + "loss": 1.4104, + "grad_norm": 0.1625133603811264, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569269248, + "loss": 1.4092, + "grad_norm": 0.12798595428466797, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569175552, + "loss": 1.3867, + "grad_norm": 0.16466458141803741, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569200128, + "loss": 1.4028, + "grad_norm": 0.1432015597820282, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56917248, + "loss": 1.3865, + "grad_norm": 0.144430473446846, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569178624, + "loss": 1.3992, + "grad_norm": 0.14820416271686554, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.935982592, + "gpu_mem": 4.569169408, + "loss": 1.3824, + "grad_norm": 0.1324981451034546, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.935982592, + "gpu_mem": 4.56910336, + "loss": 1.3916, + "grad_norm": 0.08812088519334793, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569266176, + "loss": 1.3887, + "grad_norm": 0.11203940957784653, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569178624, + "loss": 1.3497, + "grad_norm": 0.17959894239902496, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569181696, + "loss": 1.3923, + "grad_norm": 0.07334964722394943, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569236992, + "loss": 1.3696, + "grad_norm": 0.12879355251789093, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569140224, + "loss": 1.4312, + "grad_norm": 0.1913805902004242, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569307648, + "loss": 1.3791, + "grad_norm": 0.11504063010215759, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569174016, + "loss": 1.3789, + "grad_norm": 0.1521594375371933, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569163264, + "loss": 1.3575, + "grad_norm": 0.19085325300693512, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569215488, + "loss": 1.3734, + "grad_norm": 0.16663019359111786, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569140224, + "loss": 1.411, + "grad_norm": 0.17543716728687286, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569201664, + "loss": 1.4075, + "grad_norm": 0.19479577243328094, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569200128, + "loss": 1.393, + "grad_norm": 0.13281694054603577, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569144832, + "loss": 1.38, + "grad_norm": 0.10887536406517029, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.937358848, + "gpu_mem": 4.56914176, + "loss": 1.3815, + "grad_norm": 0.1453862488269806, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569243136, + "loss": 1.362, + "grad_norm": 0.1230216771364212, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569247744, + "loss": 1.3833, + "grad_norm": 0.1605168730020523, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569346048, + "loss": 1.3643, + "grad_norm": 0.09940079599618912, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569189376, + "loss": 1.39, + "grad_norm": 0.15874512493610382, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569181696, + "loss": 1.3791, + "grad_norm": 0.09351309388875961, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569124864, + "loss": 1.3794, + "grad_norm": 0.09927459806203842, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.937358848, + "gpu_mem": 4.56915712, + "loss": 1.3926, + "grad_norm": 0.1385425180196762, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569184768, + "loss": 1.3695, + "grad_norm": 0.1252247840166092, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569235456, + "loss": 1.414, + "grad_norm": 0.17590075731277466, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569147904, + "loss": 1.4, + "grad_norm": 0.09531085938215256, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569167872, + "loss": 1.3776, + "grad_norm": 0.15937922894954681, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569244672, + "loss": 1.4096, + "grad_norm": 0.20363746583461761, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569146368, + "loss": 1.3945, + "grad_norm": 0.0948168933391571, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569247744, + "loss": 1.3972, + "grad_norm": 0.11129654943943024, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569204736, + "loss": 1.3958, + "grad_norm": 0.18359526991844177, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.937358848, + "gpu_mem": 4.5692032, + "loss": 1.3951, + "grad_norm": 0.12877298891544342, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569186304, + "loss": 1.3956, + "grad_norm": 0.08722075074911118, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.937358848, + "gpu_mem": 4.56930304, + "loss": 1.3849, + "grad_norm": 0.10024484246969223, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569198592, + "loss": 1.3837, + "grad_norm": 0.12168323248624802, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569278464, + "loss": 1.387, + "grad_norm": 0.1081094741821289, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.937358848, + "gpu_mem": 4.56918784, + "loss": 1.3922, + "grad_norm": 0.18285246193408966, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.937358848, + "gpu_mem": 4.56914944, + "loss": 1.3675, + "grad_norm": 0.13098567724227905, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569152512, + "loss": 1.3691, + "grad_norm": 0.1760040819644928, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569220096, + "loss": 1.3772, + "grad_norm": 0.145279660820961, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569227776, + "loss": 1.3862, + "grad_norm": 0.08411004394292831, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.937358848, + "gpu_mem": 4.569201664, + "loss": 1.365, + "grad_norm": 0.11182354390621185, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569181696, + "loss": 1.3743, + "grad_norm": 0.09388243407011032, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56921856, + "loss": 1.3837, + "grad_norm": 0.09596379101276398, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569161728, + "loss": 1.4011, + "grad_norm": 0.11639085412025452, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569232384, + "loss": 1.3342, + "grad_norm": 0.0995016098022461, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569129472, + "loss": 1.3719, + "grad_norm": 0.09246751666069031, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569138688, + "loss": 1.3976, + "grad_norm": 0.17316646873950958, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569215488, + "loss": 1.3815, + "grad_norm": 0.15212079882621765, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569223168, + "loss": 1.3953, + "grad_norm": 0.13720908761024475, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569229312, + "loss": 1.4089, + "grad_norm": 0.14928120374679565, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569147904, + "loss": 1.3969, + "grad_norm": 0.13475453853607178, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569258496, + "loss": 1.3855, + "grad_norm": 0.10292042046785355, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569246208, + "loss": 1.4032, + "grad_norm": 0.10712137818336487, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569229312, + "loss": 1.3851, + "grad_norm": 0.1901812106370926, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569167872, + "loss": 1.3846, + "grad_norm": 0.21705807745456696, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569200128, + "loss": 1.3492, + "grad_norm": 0.10873789340257645, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569296896, + "loss": 1.3918, + "grad_norm": 0.1092628538608551, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569269248, + "loss": 1.3643, + "grad_norm": 0.11864656955003738, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569174016, + "loss": 1.4116, + "grad_norm": 0.18530677258968353, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569206272, + "loss": 1.3496, + "grad_norm": 0.10128190368413925, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56919552, + "loss": 1.3976, + "grad_norm": 0.15003758668899536, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569132544, + "loss": 1.361, + "grad_norm": 0.1354484111070633, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569253888, + "loss": 1.3827, + "grad_norm": 0.12727609276771545, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569175552, + "loss": 1.3715, + "grad_norm": 0.10924215614795685, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56928768, + "loss": 1.3908, + "grad_norm": 0.12295433133840561, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569283072, + "loss": 1.3649, + "grad_norm": 0.09022051095962524, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 2.08, + "grad_norm": 0.22233690321445465, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619745792, + "loss": 1.3628, + "grad_norm": 0.11249995231628418, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61974272, + "loss": 1.3687, + "grad_norm": 0.13681519031524658, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61962752, + "loss": 1.3768, + "grad_norm": 0.1397385150194168, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619678208, + "loss": 1.3769, + "grad_norm": 0.15561597049236298, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619672064, + "loss": 1.4113, + "grad_norm": 0.16623929142951965, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61971968, + "loss": 1.3757, + "grad_norm": 0.1103413924574852, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619652096, + "loss": 1.357, + "grad_norm": 0.12129487842321396, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619698176, + "loss": 1.3836, + "grad_norm": 0.11247876286506653, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619712, + "loss": 1.3604, + "grad_norm": 0.16204394400119781, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619667456, + "loss": 1.3774, + "grad_norm": 0.16225826740264893, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619659776, + "loss": 1.3636, + "grad_norm": 0.09976457804441452, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61978112, + "loss": 1.377, + "grad_norm": 0.1717393547296524, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619678208, + "loss": 1.3551, + "grad_norm": 0.1726837456226349, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 1.381, + "grad_norm": 0.13957460224628448, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619682816, + "loss": 1.3803, + "grad_norm": 0.11891599744558334, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619609088, + "loss": 1.3831, + "grad_norm": 0.12794281542301178, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61960448, + "loss": 1.3643, + "grad_norm": 0.1726492941379547, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619655168, + "loss": 1.3967, + "grad_norm": 0.3067849278450012, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619784192, + "loss": 1.388, + "grad_norm": 0.1305411010980606, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619645952, + "loss": 1.4259, + "grad_norm": 0.22498588263988495, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619672064, + "loss": 1.3968, + "grad_norm": 0.19523490965366364, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619684352, + "loss": 1.4073, + "grad_norm": 0.1352265328168869, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619656704, + "loss": 1.4009, + "grad_norm": 0.21941451728343964, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619630592, + "loss": 1.4299, + "grad_norm": 0.13648509979248047, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61964288, + "loss": 1.3728, + "grad_norm": 0.11963602155447006, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619670528, + "loss": 1.372, + "grad_norm": 0.19158609211444855, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61968896, + "loss": 1.3978, + "grad_norm": 0.1310475468635559, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619739648, + "loss": 1.3426, + "grad_norm": 0.13538897037506104, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619690496, + "loss": 1.3971, + "grad_norm": 0.09965481609106064, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619641344, + "loss": 1.3691, + "grad_norm": 0.09686226397752762, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619685888, + "loss": 1.3861, + "grad_norm": 0.12797991931438446, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619584512, + "loss": 1.3511, + "grad_norm": 0.07516215741634369, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619652096, + "loss": 1.3834, + "grad_norm": 0.08632528781890869, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61964288, + "loss": 1.4049, + "grad_norm": 0.16997931897640228, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619647488, + "loss": 1.3213, + "grad_norm": 0.11855362355709076, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619762688, + "loss": 1.3785, + "grad_norm": 0.1972045749425888, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619722752, + "loss": 1.4415, + "grad_norm": 0.2055157870054245, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61968128, + "loss": 1.4286, + "grad_norm": 0.20593143999576569, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 1.3673, + "grad_norm": 0.14381447434425354, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61970432, + "loss": 1.3886, + "grad_norm": 0.12741021811962128, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619724288, + "loss": 1.3904, + "grad_norm": 0.09041012823581696, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619641344, + "loss": 1.3761, + "grad_norm": 0.13817766308784485, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619776512, + "loss": 1.3765, + "grad_norm": 0.10572250932455063, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619730432, + "loss": 1.3739, + "grad_norm": 0.1270529180765152, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619751936, + "loss": 1.3924, + "grad_norm": 0.1123330146074295, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619638272, + "loss": 1.3494, + "grad_norm": 0.24410994350910187, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619625984, + "loss": 1.4469, + "grad_norm": 0.23390009999275208, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619630592, + "loss": 1.3979, + "grad_norm": 0.17118783295154572, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619667456, + "loss": 1.3838, + "grad_norm": 0.14100313186645508, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619629056, + "loss": 1.3781, + "grad_norm": 0.16697362065315247, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619675136, + "loss": 1.3978, + "grad_norm": 0.11307302862405777, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619684352, + "loss": 1.3979, + "grad_norm": 0.12262143194675446, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619722752, + "loss": 1.3626, + "grad_norm": 0.08717484772205353, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619641344, + "loss": 1.3819, + "grad_norm": 0.15800683200359344, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61965056, + "loss": 1.3703, + "grad_norm": 0.18218304216861725, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619716608, + "loss": 1.3693, + "grad_norm": 0.07954366505146027, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61964288, + "loss": 1.3564, + "grad_norm": 0.11705395579338074, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619636736, + "loss": 1.3867, + "grad_norm": 0.1254345029592514, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619702784, + "loss": 1.3744, + "grad_norm": 0.12117626518011093, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619661312, + "loss": 1.3751, + "grad_norm": 0.10367321968078613, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619774976, + "loss": 1.3674, + "grad_norm": 0.10992772877216339, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619652096, + "loss": 1.3818, + "grad_norm": 0.1104874387383461, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619678208, + "loss": 1.3935, + "grad_norm": 0.09905646741390228, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619684352, + "loss": 1.4003, + "grad_norm": 0.18350572884082794, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619751936, + "loss": 1.3817, + "grad_norm": 0.09636932611465454, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619607552, + "loss": 1.3781, + "grad_norm": 0.10567140579223633, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619741184, + "loss": 1.381, + "grad_norm": 0.0715365782380104, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619770368, + "loss": 1.3627, + "grad_norm": 0.11272958666086197, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619639808, + "loss": 1.3942, + "grad_norm": 0.10147303342819214, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619641344, + "loss": 1.3874, + "grad_norm": 0.11673005670309067, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61964288, + "loss": 1.3516, + "grad_norm": 0.06847807765007019, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619716608, + "loss": 1.4088, + "grad_norm": 0.1634063571691513, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619698176, + "loss": 1.3552, + "grad_norm": 0.07933250069618225, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619641344, + "loss": 1.41, + "grad_norm": 0.12397512048482895, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 1.3776, + "grad_norm": 0.14683836698532104, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619747328, + "loss": 1.3826, + "grad_norm": 0.08442371338605881, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619699712, + "loss": 1.4018, + "grad_norm": 0.1321588009595871, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.940701184, + "gpu_mem": 4.6198272, + "loss": 1.3877, + "grad_norm": 0.1360960602760315, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619712, + "loss": 1.3737, + "grad_norm": 0.12411756813526154, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619661312, + "loss": 1.3745, + "grad_norm": 0.13634656369686127, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619649024, + "loss": 1.3396, + "grad_norm": 0.12806583940982819, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619641344, + "loss": 1.3954, + "grad_norm": 0.08278314769268036, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619764224, + "loss": 1.3678, + "grad_norm": 0.10108662396669388, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61970432, + "loss": 1.3864, + "grad_norm": 0.1757262945175171, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61965056, + "loss": 1.3835, + "grad_norm": 0.15929225087165833, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619667456, + "loss": 1.4197, + "grad_norm": 0.1868702471256256, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619807232, + "loss": 1.3675, + "grad_norm": 0.11450479924678802, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619678208, + "loss": 1.3924, + "grad_norm": 0.10315029323101044, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.940701184, + "gpu_mem": 4.6196736, + "loss": 1.3855, + "grad_norm": 0.12842804193496704, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619576832, + "loss": 1.3747, + "grad_norm": 0.14868056774139404, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619652096, + "loss": 1.3592, + "grad_norm": 0.11574488133192062, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619672064, + "loss": 1.3935, + "grad_norm": 0.11888021975755692, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619638272, + "loss": 1.3724, + "grad_norm": 0.09984689950942993, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619599872, + "loss": 1.3794, + "grad_norm": 0.10075947642326355, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619725824, + "loss": 1.3916, + "grad_norm": 0.09009528905153275, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619602944, + "loss": 1.4057, + "grad_norm": 0.1116676852107048, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619645952, + "loss": 1.4148, + "grad_norm": 0.11232031136751175, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619629056, + "loss": 1.3497, + "grad_norm": 0.1664682775735855, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619670528, + "loss": 1.382, + "grad_norm": 0.10978628695011139, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619610624, + "loss": 1.3514, + "grad_norm": 0.08486147224903107, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619633664, + "loss": 1.3529, + "grad_norm": 0.13282345235347748, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619622912, + "loss": 1.3591, + "grad_norm": 0.09368728846311569, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.940701184, + "gpu_mem": 4.6196352, + "loss": 1.3781, + "grad_norm": 0.08968287706375122, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619638272, + "loss": 1.3526, + "grad_norm": 0.20325089991092682, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61961984, + "loss": 1.3744, + "grad_norm": 0.07840386033058167, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619625984, + "loss": 1.3708, + "grad_norm": 0.12875628471374512, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61957376, + "loss": 1.4369, + "grad_norm": 0.3250971734523773, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619645952, + "loss": 1.3795, + "grad_norm": 0.11826823651790619, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619687424, + "loss": 1.3656, + "grad_norm": 0.11403290927410126, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.940701184, + "gpu_mem": 4.6196352, + "loss": 1.351, + "grad_norm": 0.10229948163032532, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619645952, + "loss": 1.4038, + "grad_norm": 0.12131376564502716, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619629056, + "loss": 1.386, + "grad_norm": 0.14341866970062256, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619684352, + "loss": 1.3832, + "grad_norm": 0.1053689643740654, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61965056, + "loss": 1.3707, + "grad_norm": 0.08268966525793076, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619659776, + "loss": 1.4034, + "grad_norm": 0.10320953279733658, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619661312, + "loss": 1.3777, + "grad_norm": 0.15242639183998108, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61988864, + "loss": 1.3888, + "grad_norm": 0.11086822301149368, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61968128, + "loss": 1.3641, + "grad_norm": 0.11487813293933868, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619653632, + "loss": 1.3806, + "grad_norm": 0.12873968482017517, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619618304, + "loss": 1.3749, + "grad_norm": 0.1827714592218399, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619708928, + "loss": 1.3648, + "grad_norm": 0.12241631001234055, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619633664, + "loss": 1.3695, + "grad_norm": 0.14607319235801697, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61968128, + "loss": 1.3699, + "grad_norm": 0.10468507558107376, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619622912, + "loss": 1.3855, + "grad_norm": 0.1473015695810318, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619844096, + "loss": 1.3933, + "grad_norm": 0.11238094419240952, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619725824, + "loss": 1.3705, + "grad_norm": 0.130208358168602, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619618304, + "loss": 1.3525, + "grad_norm": 0.2364986091852188, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61965824, + "loss": 1.3923, + "grad_norm": 0.149781733751297, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619705856, + "loss": 1.3657, + "grad_norm": 0.11875800788402557, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619672064, + "loss": 1.3924, + "grad_norm": 0.12045573443174362, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61972736, + "loss": 1.3894, + "grad_norm": 0.10391460359096527, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61961984, + "loss": 1.372, + "grad_norm": 0.11503754556179047, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61970432, + "loss": 1.3518, + "grad_norm": 0.15308599174022675, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619693568, + "loss": 1.3693, + "grad_norm": 0.15860919654369354, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619649024, + "loss": 1.3762, + "grad_norm": 0.08910667151212692, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619738112, + "loss": 1.3969, + "grad_norm": 0.10641851276159286, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61966592, + "loss": 1.3722, + "grad_norm": 0.09762413799762726, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619675136, + "loss": 1.381, + "grad_norm": 0.07750643044710159, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61972736, + "loss": 1.3523, + "grad_norm": 0.11001694202423096, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61972736, + "loss": 1.3464, + "grad_norm": 0.2357664406299591, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619764224, + "loss": 1.385, + "grad_norm": 0.1766819804906845, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61968128, + "loss": 1.4004, + "grad_norm": 0.14639823138713837, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.940701184, + "gpu_mem": 4.6197504, + "loss": 1.3816, + "grad_norm": 0.10319050401449203, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619692032, + "loss": 1.3755, + "grad_norm": 0.11984696984291077, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619698176, + "loss": 1.3538, + "grad_norm": 0.15688806772232056, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619609088, + "loss": 1.3904, + "grad_norm": 0.1527719646692276, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61972736, + "loss": 1.3919, + "grad_norm": 0.11574702709913254, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 1.358, + "grad_norm": 0.10883761942386627, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619625984, + "loss": 1.3831, + "grad_norm": 0.16567623615264893, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619675136, + "loss": 1.3712, + "grad_norm": 0.18206211924552917, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619801088, + "loss": 1.3939, + "grad_norm": 0.11878462880849838, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619599872, + "loss": 1.3365, + "grad_norm": 0.10213032364845276, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619664384, + "loss": 1.3637, + "grad_norm": 0.10336507111787796, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 1.3568, + "grad_norm": 0.11558816581964493, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619670528, + "loss": 1.3613, + "grad_norm": 0.09730163216590881, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619667456, + "loss": 1.3619, + "grad_norm": 0.16565188765525818, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61964288, + "loss": 1.3811, + "grad_norm": 0.11589106917381287, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619625984, + "loss": 1.3584, + "grad_norm": 0.08044332265853882, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61962752, + "loss": 1.3778, + "grad_norm": 0.07388331741094589, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 1.3519, + "grad_norm": 0.15207646787166595, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619745792, + "loss": 1.3771, + "grad_norm": 0.24081583321094513, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619699712, + "loss": 1.3604, + "grad_norm": 0.1189778745174408, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.940701184, + "gpu_mem": 4.6197504, + "loss": 1.3885, + "grad_norm": 0.10157686471939087, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619721216, + "loss": 1.4306, + "grad_norm": 0.1664385050535202, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619655168, + "loss": 1.3433, + "grad_norm": 0.10322940349578857, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61961216, + "loss": 1.3524, + "grad_norm": 0.11840961128473282, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619639808, + "loss": 1.3659, + "grad_norm": 0.22352561354637146, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619622912, + "loss": 1.392, + "grad_norm": 0.10508042573928833, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61966592, + "loss": 1.3631, + "grad_norm": 0.1195199266076088, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61971968, + "loss": 1.3658, + "grad_norm": 0.1408369541168213, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619616768, + "loss": 1.3699, + "grad_norm": 0.1259409338235855, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619676672, + "loss": 1.3491, + "grad_norm": 0.16986438632011414, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619599872, + "loss": 1.37, + "grad_norm": 0.22811450064182281, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619759616, + "loss": 1.3742, + "grad_norm": 0.1375986784696579, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619613696, + "loss": 1.3667, + "grad_norm": 0.0896725133061409, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619636736, + "loss": 1.3375, + "grad_norm": 0.0919974073767662, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619659776, + "loss": 1.4237, + "grad_norm": 0.2571694850921631, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619884032, + "loss": 1.4476, + "grad_norm": 0.21741774678230286, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619641344, + "loss": 1.3839, + "grad_norm": 0.11413206905126572, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619764224, + "loss": 1.3841, + "grad_norm": 0.1303163468837738, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619679744, + "loss": 1.3954, + "grad_norm": 0.1361367106437683, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619768832, + "loss": 1.3539, + "grad_norm": 0.18704886734485626, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619667456, + "loss": 1.3427, + "grad_norm": 0.12491501867771149, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619675136, + "loss": 1.3785, + "grad_norm": 0.11554654687643051, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619598336, + "loss": 1.3428, + "grad_norm": 0.1435931921005249, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619678208, + "loss": 1.4017, + "grad_norm": 0.18761184811592102, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619649024, + "loss": 1.3624, + "grad_norm": 0.13378268480300903, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619636736, + "loss": 1.3513, + "grad_norm": 0.11485403776168823, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619779584, + "loss": 1.3448, + "grad_norm": 0.15612168610095978, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619662848, + "loss": 1.38, + "grad_norm": 0.11850685626268387, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.940701184, + "gpu_mem": 4.6196736, + "loss": 1.3914, + "grad_norm": 0.11695393174886703, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619828736, + "loss": 1.3992, + "grad_norm": 0.16192936897277832, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619638272, + "loss": 1.3964, + "grad_norm": 0.16167229413986206, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619659776, + "loss": 1.3865, + "grad_norm": 0.1481785923242569, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.940701184, + "gpu_mem": 4.61969664, + "loss": 1.3949, + "grad_norm": 0.12188734114170074, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.940701184, + "gpu_mem": 4.619283456, + "loss": 2.0251, + "grad_norm": 0.1877029538154602, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569155584, + "loss": 1.3789, + "grad_norm": 0.21110688149929047, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56913408, + "loss": 1.3888, + "grad_norm": 0.13724982738494873, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56914944, + "loss": 1.3752, + "grad_norm": 0.2011563777923584, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569296896, + "loss": 1.3672, + "grad_norm": 0.12336952239274979, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569243136, + "loss": 1.4029, + "grad_norm": 0.13523390889167786, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569260032, + "loss": 1.4075, + "grad_norm": 0.1506805121898651, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569212416, + "loss": 1.3991, + "grad_norm": 0.15464822947978973, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569236992, + "loss": 1.3917, + "grad_norm": 0.1349705457687378, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569284608, + "loss": 1.3705, + "grad_norm": 0.17840957641601562, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569193984, + "loss": 1.3916, + "grad_norm": 0.21362873911857605, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569166336, + "loss": 1.3476, + "grad_norm": 0.1691908985376358, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569255424, + "loss": 1.3452, + "grad_norm": 0.10908565670251846, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569146368, + "loss": 1.3745, + "grad_norm": 0.1766427755355835, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569198592, + "loss": 1.3609, + "grad_norm": 0.14026325941085815, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569152512, + "loss": 1.3529, + "grad_norm": 0.15024764835834503, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569355264, + "loss": 1.3686, + "grad_norm": 0.16674309968948364, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569353728, + "loss": 1.3986, + "grad_norm": 0.21658866107463837, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569276928, + "loss": 1.3674, + "grad_norm": 0.15271611511707306, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56923392, + "loss": 1.36, + "grad_norm": 0.14001798629760742, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56921088, + "loss": 1.3393, + "grad_norm": 0.11234854906797409, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.940701184, + "gpu_mem": 4.5691648, + "loss": 1.3601, + "grad_norm": 0.1471264660358429, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569281536, + "loss": 1.3666, + "grad_norm": 0.20594516396522522, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569169408, + "loss": 1.3684, + "grad_norm": 0.11995385587215424, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569158656, + "loss": 1.3793, + "grad_norm": 0.1316065490245819, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569163264, + "loss": 1.3678, + "grad_norm": 0.08662188053131104, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569278464, + "loss": 1.3951, + "grad_norm": 0.19127745926380157, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569252352, + "loss": 1.3729, + "grad_norm": 0.1398060917854309, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569154048, + "loss": 1.3444, + "grad_norm": 0.1846694052219391, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569189376, + "loss": 1.3437, + "grad_norm": 0.15681670606136322, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569155584, + "loss": 1.3581, + "grad_norm": 0.1098094955086708, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569289216, + "loss": 1.3432, + "grad_norm": 0.09406350553035736, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569263104, + "loss": 1.372, + "grad_norm": 0.14715464413166046, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569307648, + "loss": 1.3433, + "grad_norm": 0.19197596609592438, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569198592, + "loss": 1.3623, + "grad_norm": 0.20966041088104248, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569215488, + "loss": 1.3639, + "grad_norm": 0.253628671169281, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569290752, + "loss": 1.3742, + "grad_norm": 0.13425350189208984, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569138688, + "loss": 1.371, + "grad_norm": 0.14083567261695862, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56925696, + "loss": 1.3698, + "grad_norm": 0.23924005031585693, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569246208, + "loss": 1.3426, + "grad_norm": 0.188904270529747, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569212416, + "loss": 1.3568, + "grad_norm": 0.10972847789525986, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569190912, + "loss": 1.3675, + "grad_norm": 0.16597089171409607, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569186304, + "loss": 1.3737, + "grad_norm": 0.16293185949325562, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569197056, + "loss": 1.3672, + "grad_norm": 0.1463368684053421, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569250816, + "loss": 1.392, + "grad_norm": 0.1492786854505539, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569167872, + "loss": 1.3885, + "grad_norm": 0.15046493709087372, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56919552, + "loss": 1.39, + "grad_norm": 0.1962796151638031, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569193984, + "loss": 1.3676, + "grad_norm": 0.1770879030227661, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569236992, + "loss": 1.3788, + "grad_norm": 0.15598605573177338, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569235456, + "loss": 1.3727, + "grad_norm": 0.24079090356826782, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569286144, + "loss": 1.3647, + "grad_norm": 0.1498352736234665, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569292288, + "loss": 1.3517, + "grad_norm": 0.0972243919968605, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569230848, + "loss": 1.3716, + "grad_norm": 0.11667793989181519, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569220096, + "loss": 1.3884, + "grad_norm": 0.19834527373313904, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569244672, + "loss": 1.3781, + "grad_norm": 0.12636996805667877, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569261568, + "loss": 1.363, + "grad_norm": 0.18252292275428772, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569132544, + "loss": 1.3678, + "grad_norm": 0.11808428913354874, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569307648, + "loss": 1.3533, + "grad_norm": 0.15763521194458008, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56926464, + "loss": 1.358, + "grad_norm": 0.13373996317386627, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569174016, + "loss": 1.3748, + "grad_norm": 0.18384166061878204, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569121792, + "loss": 1.3529, + "grad_norm": 0.10703245550394058, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569137152, + "loss": 1.3873, + "grad_norm": 0.13425439596176147, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569238528, + "loss": 1.349, + "grad_norm": 0.10723390430212021, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569247744, + "loss": 1.3915, + "grad_norm": 0.23947182297706604, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569276928, + "loss": 1.3714, + "grad_norm": 0.2107890099287033, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569338368, + "loss": 1.3429, + "grad_norm": 0.1472177952528, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569192448, + "loss": 1.3676, + "grad_norm": 0.15247642993927002, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56928, + "loss": 1.3568, + "grad_norm": 0.2648078501224518, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.3786, + "grad_norm": 0.2074078619480133, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569253888, + "loss": 1.3494, + "grad_norm": 0.11796335130929947, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569289216, + "loss": 1.3669, + "grad_norm": 0.172476664185524, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569227776, + "loss": 1.3272, + "grad_norm": 0.14716388285160065, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569184768, + "loss": 1.339, + "grad_norm": 0.11250872910022736, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569186304, + "loss": 1.3994, + "grad_norm": 0.13964863121509552, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569223168, + "loss": 1.3332, + "grad_norm": 0.14758144319057465, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56919552, + "loss": 1.3414, + "grad_norm": 0.13047748804092407, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569252352, + "loss": 1.3674, + "grad_norm": 0.14995726943016052, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569281536, + "loss": 1.3503, + "grad_norm": 0.21032758057117462, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569181696, + "loss": 1.3801, + "grad_norm": 0.192751944065094, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569204736, + "loss": 1.3461, + "grad_norm": 0.1810380071401596, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56918016, + "loss": 1.3769, + "grad_norm": 0.20646949112415314, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56927232, + "loss": 1.4, + "grad_norm": 0.1648380309343338, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56914176, + "loss": 1.3517, + "grad_norm": 0.1465964913368225, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569278464, + "loss": 1.3797, + "grad_norm": 0.117573581635952, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569135616, + "loss": 1.3791, + "grad_norm": 0.23789352178573608, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569158656, + "loss": 1.3509, + "grad_norm": 0.10253336280584335, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569206272, + "loss": 1.3955, + "grad_norm": 0.1927931308746338, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56924928, + "loss": 1.3689, + "grad_norm": 0.11004645377397537, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569177088, + "loss": 1.3668, + "grad_norm": 0.1641109138727188, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569275392, + "loss": 1.3678, + "grad_norm": 0.17157648503780365, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569170944, + "loss": 1.3701, + "grad_norm": 0.1508682817220688, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56932608, + "loss": 1.3888, + "grad_norm": 0.15306159853935242, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569158656, + "loss": 1.3501, + "grad_norm": 0.14828428626060486, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569201664, + "loss": 1.3986, + "grad_norm": 0.16945599019527435, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569223168, + "loss": 1.3742, + "grad_norm": 0.15842625498771667, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569146368, + "loss": 1.357, + "grad_norm": 0.11870865523815155, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569283072, + "loss": 1.3608, + "grad_norm": 0.17626595497131348, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569170944, + "loss": 1.3392, + "grad_norm": 0.132248193025589, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569167872, + "loss": 1.4079, + "grad_norm": 0.1505178064107895, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56914176, + "loss": 1.3036, + "grad_norm": 0.10795271396636963, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569177088, + "loss": 1.3275, + "grad_norm": 0.15352462232112885, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569192448, + "loss": 1.3452, + "grad_norm": 0.16307023167610168, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569186304, + "loss": 1.3499, + "grad_norm": 0.12392111867666245, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569198592, + "loss": 1.405, + "grad_norm": 0.17996898293495178, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569230848, + "loss": 1.3421, + "grad_norm": 0.15211565792560577, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569232384, + "loss": 1.354, + "grad_norm": 0.10257530957460403, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569204736, + "loss": 1.3668, + "grad_norm": 0.15264037251472473, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569253888, + "loss": 1.3299, + "grad_norm": 0.20252157747745514, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.3658, + "grad_norm": 0.1481276899576187, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.940701184, + "gpu_mem": 4.5692416, + "loss": 1.3444, + "grad_norm": 0.2078181952238083, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569232384, + "loss": 1.3309, + "grad_norm": 0.1234116405248642, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569166336, + "loss": 1.3463, + "grad_norm": 0.19892245531082153, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56918784, + "loss": 1.3553, + "grad_norm": 0.12429164350032806, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569243136, + "loss": 1.3738, + "grad_norm": 0.34524235129356384, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569160192, + "loss": 1.3643, + "grad_norm": 0.12196838855743408, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569224704, + "loss": 1.3508, + "grad_norm": 0.12743444740772247, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.3967, + "grad_norm": 0.16924889385700226, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569215488, + "loss": 1.3827, + "grad_norm": 0.20860859751701355, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569189376, + "loss": 1.359, + "grad_norm": 0.15096603333950043, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.3447, + "grad_norm": 0.15661181509494781, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569193984, + "loss": 1.3892, + "grad_norm": 0.1693093627691269, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569098752, + "loss": 1.3608, + "grad_norm": 0.1816324144601822, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569167872, + "loss": 1.3404, + "grad_norm": 0.18792948126792908, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569215488, + "loss": 1.3691, + "grad_norm": 0.1429559886455536, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569160192, + "loss": 1.3624, + "grad_norm": 0.2560316026210785, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56921856, + "loss": 1.3732, + "grad_norm": 0.14215078949928284, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569224704, + "loss": 1.3375, + "grad_norm": 0.1378907710313797, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569361408, + "loss": 1.3352, + "grad_norm": 0.12387850880622864, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569170944, + "loss": 1.367, + "grad_norm": 0.14952220022678375, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569235456, + "loss": 1.3169, + "grad_norm": 0.16929730772972107, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569150976, + "loss": 1.3616, + "grad_norm": 0.22113659977912903, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569167872, + "loss": 1.3524, + "grad_norm": 0.1316693276166916, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569167872, + "loss": 1.3214, + "grad_norm": 0.14906273782253265, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569293824, + "loss": 1.351, + "grad_norm": 0.1172790601849556, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569174016, + "loss": 1.3361, + "grad_norm": 0.16457872092723846, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569332224, + "loss": 1.3943, + "grad_norm": 0.15351296961307526, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56915712, + "loss": 1.3296, + "grad_norm": 0.11193352937698364, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569175552, + "loss": 1.3931, + "grad_norm": 0.13140545785427094, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569352192, + "loss": 1.3191, + "grad_norm": 0.14270859956741333, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56922624, + "loss": 1.3733, + "grad_norm": 0.12348409742116928, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569223168, + "loss": 1.3359, + "grad_norm": 0.15581145882606506, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569232384, + "loss": 1.3799, + "grad_norm": 0.1289188712835312, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569107968, + "loss": 1.3401, + "grad_norm": 0.1234297826886177, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569390592, + "loss": 1.3602, + "grad_norm": 0.1753847897052765, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569154048, + "loss": 1.3393, + "grad_norm": 0.1218089610338211, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56924928, + "loss": 1.367, + "grad_norm": 0.0962374359369278, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569215488, + "loss": 1.3572, + "grad_norm": 0.24412646889686584, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.3794, + "grad_norm": 0.1879199743270874, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569201664, + "loss": 1.3675, + "grad_norm": 0.13856124877929688, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569252352, + "loss": 1.4247, + "grad_norm": 0.19396206736564636, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569147904, + "loss": 1.3723, + "grad_norm": 0.128175288438797, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569177088, + "loss": 1.3545, + "grad_norm": 0.1593579351902008, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569120256, + "loss": 1.4184, + "grad_norm": 0.14318133890628815, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56919552, + "loss": 1.3428, + "grad_norm": 0.12492880970239639, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56918784, + "loss": 1.3794, + "grad_norm": 0.14729654788970947, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56921856, + "loss": 1.3763, + "grad_norm": 0.1457790583372116, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569147904, + "loss": 1.3781, + "grad_norm": 0.14082591235637665, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569174016, + "loss": 1.3487, + "grad_norm": 0.19487358629703522, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569269248, + "loss": 1.3517, + "grad_norm": 0.16892223060131073, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.940701184, + "gpu_mem": 4.5691648, + "loss": 1.3282, + "grad_norm": 0.12526707351207733, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569174016, + "loss": 1.4085, + "grad_norm": 0.1377761960029602, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569201664, + "loss": 1.371, + "grad_norm": 0.16604435443878174, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569215488, + "loss": 1.3661, + "grad_norm": 0.1917344331741333, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569209344, + "loss": 1.3788, + "grad_norm": 0.14817111194133759, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56917248, + "loss": 1.3518, + "grad_norm": 0.15677215158939362, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569230848, + "loss": 1.3724, + "grad_norm": 0.21148453652858734, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569140224, + "loss": 1.3758, + "grad_norm": 0.16044655442237854, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569224704, + "loss": 1.3685, + "grad_norm": 0.14097799360752106, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569263104, + "loss": 1.3409, + "grad_norm": 0.19615375995635986, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569307648, + "loss": 1.4195, + "grad_norm": 0.1256924867630005, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569269248, + "loss": 1.3237, + "grad_norm": 0.15286113321781158, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569206272, + "loss": 1.3701, + "grad_norm": 0.17890912294387817, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569260032, + "loss": 1.3776, + "grad_norm": 0.2363748997449875, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569198592, + "loss": 1.4041, + "grad_norm": 0.26145821809768677, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569184768, + "loss": 1.3444, + "grad_norm": 0.13829472661018372, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569396736, + "loss": 1.3406, + "grad_norm": 0.1269315630197525, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569261568, + "loss": 1.3424, + "grad_norm": 0.2668635845184326, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569235456, + "loss": 1.3667, + "grad_norm": 0.15625768899917603, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.940701184, + "gpu_mem": 4.5692416, + "loss": 1.3446, + "grad_norm": 0.1618412733078003, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569275392, + "loss": 1.3695, + "grad_norm": 0.1762702465057373, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.4125, + "grad_norm": 0.15875622630119324, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569267712, + "loss": 1.3889, + "grad_norm": 0.19755540788173676, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569155584, + "loss": 1.3605, + "grad_norm": 0.1052677109837532, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56921856, + "loss": 1.3844, + "grad_norm": 0.1540636569261551, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.3429, + "grad_norm": 0.16524913907051086, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569183232, + "loss": 1.3565, + "grad_norm": 0.11653438210487366, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569267712, + "loss": 1.3731, + "grad_norm": 0.15984803438186646, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569213952, + "loss": 1.3916, + "grad_norm": 0.1596316546201706, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56923392, + "loss": 1.373, + "grad_norm": 0.17287905514240265, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569161728, + "loss": 1.3543, + "grad_norm": 0.24945701658725739, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56924928, + "loss": 1.3835, + "grad_norm": 0.12757360935211182, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569217024, + "loss": 1.337, + "grad_norm": 0.13159261643886566, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.940701184, + "gpu_mem": 4.56919552, + "loss": 1.3351, + "grad_norm": 0.10587425529956818, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569138688, + "loss": 1.3794, + "grad_norm": 0.27038824558258057, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569204736, + "loss": 1.4104, + "grad_norm": 0.13926050066947937, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569246208, + "loss": 1.3695, + "grad_norm": 0.10030262172222137, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.940701184, + "gpu_mem": 4.569246208, + "train_runtime": 16671.9115, + "train_samples_per_second": 2.261, + "train_steps_per_second": 0.035, + "total_flos": 8.929666662060442e+16, + "train_loss": 1.4663503652527219 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4758d99093e963e7b960b3e04b3ff68f0cc5fe --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 4, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 2, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b89e83ee72c0c53022686fc96773c8b252b5c6ac --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5027624309392266 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f7bacd1efbc1d68a749180068e3d962aee59a291 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 3153920 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-winogrande-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2", + "seed": 42, + "timestamp": "2025-09-13T00:59:49.658753" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..0a7ba08e1112418bd26436075f76b10c5b945883 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r2-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.843970048, + "gpu_mem": 4.429950464, + "loss": 3.3802, + "grad_norm": 2.987323522567749, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.850064896, + "gpu_mem": 4.455179776, + "loss": 3.3361, + "grad_norm": 2.9495489597320557, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.850458112, + "gpu_mem": 4.455184384, + "loss": 3.2353, + "grad_norm": 2.7779335975646973, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.850851328, + "gpu_mem": 4.455182848, + "loss": 3.2108, + "grad_norm": 2.8182156085968018, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.851244544, + "gpu_mem": 4.455182848, + "loss": 3.2437, + "grad_norm": 2.8791325092315674, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.851441152, + "gpu_mem": 4.455188992, + "loss": 3.2588, + "grad_norm": 2.9321441650390625, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.85163776, + "gpu_mem": 4.455195136, + "loss": 3.1848, + "grad_norm": 2.9217889308929443, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.851834368, + "gpu_mem": 4.45517824, + "loss": 3.1515, + "grad_norm": 2.7925472259521484, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.852030976, + "gpu_mem": 4.455184384, + "loss": 3.3598, + "grad_norm": 2.9350500106811523, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.852227584, + "gpu_mem": 4.455187456, + "loss": 3.191, + "grad_norm": 2.9643239974975586, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.852424192, + "gpu_mem": 4.455176704, + "loss": 3.1396, + "grad_norm": 2.9508512020111084, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.8526208, + "gpu_mem": 4.455181312, + "loss": 3.2628, + "grad_norm": 2.9139976501464844, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.852817408, + "gpu_mem": 4.455188992, + "loss": 3.0435, + "grad_norm": 2.861978054046631, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.852817408, + "gpu_mem": 4.455184384, + "loss": 3.0172, + "grad_norm": 3.0608623027801514, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.853014016, + "gpu_mem": 4.455184384, + "loss": 2.8786, + "grad_norm": 3.0879340171813965, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.853210624, + "gpu_mem": 4.455181312, + "loss": 2.8875, + "grad_norm": 3.036341428756714, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.853210624, + "gpu_mem": 4.455181312, + "loss": 2.7296, + "grad_norm": 3.0215604305267334, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.853407232, + "gpu_mem": 4.455184384, + "loss": 2.7569, + "grad_norm": 3.064049243927002, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.853407232, + "gpu_mem": 4.455181312, + "loss": 2.6668, + "grad_norm": 3.080202341079712, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.85360384, + "gpu_mem": 4.455188992, + "loss": 2.7177, + "grad_norm": 2.9739267826080322, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.85360384, + "gpu_mem": 4.455181312, + "loss": 2.6081, + "grad_norm": 3.080386161804199, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455181312, + "loss": 2.4602, + "grad_norm": 3.010634422302246, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455176704, + "loss": 2.4991, + "grad_norm": 3.049595355987549, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455179776, + "loss": 2.3215, + "grad_norm": 2.8185763359069824, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455182848, + "loss": 2.2689, + "grad_norm": 2.7809863090515137, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.853800448, + "gpu_mem": 4.45517824, + "loss": 2.0085, + "grad_norm": 2.7902612686157227, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455176704, + "loss": 2.1084, + "grad_norm": 2.8021862506866455, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455182848, + "loss": 1.8094, + "grad_norm": 2.499572992324829, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455181312, + "loss": 1.8083, + "grad_norm": 2.5020790100097656, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455181312, + "loss": 1.7753, + "grad_norm": 2.3316545486450195, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.853800448, + "gpu_mem": 4.455181312, + "loss": 1.5834, + "grad_norm": 2.2503652572631836, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.853997056, + "gpu_mem": 4.45517824, + "loss": 1.5857, + "grad_norm": 2.011564016342163, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.853997056, + "gpu_mem": 4.45517824, + "loss": 1.3195, + "grad_norm": 1.6936798095703125, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.853997056, + "gpu_mem": 4.45517824, + "loss": 1.2393, + "grad_norm": 1.5177048444747925, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.853997056, + "gpu_mem": 4.455184384, + "loss": 1.1429, + "grad_norm": 1.3398171663284302, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.853997056, + "gpu_mem": 4.455179776, + "loss": 1.1821, + "grad_norm": 1.2413849830627441, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.853997056, + "gpu_mem": 4.45517824, + "loss": 0.9898, + "grad_norm": 0.8798674941062927, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.853997056, + "gpu_mem": 4.455182848, + "loss": 1.0126, + "grad_norm": 0.8425956964492798, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.853997056, + "gpu_mem": 4.455188992, + "loss": 1.0096, + "grad_norm": 0.9666163325309753, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.853997056, + "gpu_mem": 4.45518592, + "loss": 0.9034, + "grad_norm": 0.6025753617286682, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.853997056, + "gpu_mem": 4.45518592, + "loss": 0.8819, + "grad_norm": 0.7408265471458435, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.853997056, + "gpu_mem": 4.455182848, + "loss": 0.8152, + "grad_norm": 0.4866456091403961, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.853997056, + "gpu_mem": 4.455182848, + "loss": 0.8601, + "grad_norm": 0.6245154738426208, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.854193664, + "gpu_mem": 4.455182848, + "loss": 0.7818, + "grad_norm": 0.36564165353775024, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.854193664, + "gpu_mem": 4.455190528, + "loss": 0.7902, + "grad_norm": 0.4412634074687958, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.854193664, + "gpu_mem": 4.455182848, + "loss": 0.7668, + "grad_norm": 0.38179007172584534, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7312, + "grad_norm": 0.3691229522228241, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7873, + "grad_norm": 0.47540605068206787, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.7467, + "grad_norm": 0.8892286419868469, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7369, + "grad_norm": 0.2513984739780426, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7387, + "grad_norm": 0.3403948247432709, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7098, + "grad_norm": 0.5572483539581299, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7445, + "grad_norm": 0.7724244594573975, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455172096, + "loss": 0.692, + "grad_norm": 0.19667086005210876, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6968, + "grad_norm": 0.2634413540363312, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7345, + "grad_norm": 0.3488966226577759, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.7082, + "grad_norm": 0.32447463274002075, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7154, + "grad_norm": 0.27029919624328613, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.7361, + "grad_norm": 0.3998737633228302, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7238, + "grad_norm": 0.37246114015579224, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7105, + "grad_norm": 0.20969606935977936, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6867, + "grad_norm": 0.1595180332660675, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7119, + "grad_norm": 0.4252409040927887, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7126, + "grad_norm": 0.24246743321418762, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7019, + "grad_norm": 0.3751748502254486, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7084, + "grad_norm": 0.27233508229255676, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7097, + "grad_norm": 0.3623232841491699, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7046, + "grad_norm": 0.1557946652173996, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6989, + "grad_norm": 0.1449778825044632, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7176, + "grad_norm": 0.29982030391693115, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6937, + "grad_norm": 0.19219128787517548, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6903, + "grad_norm": 0.17342492938041687, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7149, + "grad_norm": 0.3396533131599426, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6995, + "grad_norm": 0.1771945059299469, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7051, + "grad_norm": 0.44677069783210754, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.713, + "grad_norm": 0.38033682107925415, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7006, + "grad_norm": 0.18717817962169647, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6881, + "grad_norm": 0.198476180434227, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7002, + "grad_norm": 0.13516530394554138, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7164, + "grad_norm": 0.12599493563175201, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6938, + "grad_norm": 0.14423681795597076, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.683, + "grad_norm": 0.14571498334407806, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.696, + "grad_norm": 0.29470860958099365, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7065, + "grad_norm": 0.23688244819641113, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7183, + "grad_norm": 0.18180721998214722, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7206, + "grad_norm": 0.17528623342514038, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6961, + "grad_norm": 0.16329164803028107, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6976, + "grad_norm": 0.14971400797367096, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7088, + "grad_norm": 0.24654538929462433, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6912, + "grad_norm": 0.23716630041599274, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6915, + "grad_norm": 0.43149498105049133, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7151, + "grad_norm": 0.24975161254405975, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7001, + "grad_norm": 0.15113577246665955, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7006, + "grad_norm": 0.17224188148975372, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6907, + "grad_norm": 0.2093725949525833, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7045, + "grad_norm": 0.36437565088272095, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6949, + "grad_norm": 0.11575677990913391, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7089, + "grad_norm": 0.12552209198474884, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7165, + "grad_norm": 0.4358549416065216, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.705, + "grad_norm": 0.16167262196540833, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6947, + "grad_norm": 0.18331022560596466, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.705, + "grad_norm": 0.2435128539800644, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6893, + "grad_norm": 0.12310245633125305, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7227, + "grad_norm": 0.317805677652359, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7039, + "grad_norm": 0.19519931077957153, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6987, + "grad_norm": 0.1593208909034729, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7137, + "grad_norm": 0.19044259190559387, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6627, + "grad_norm": 0.15448793768882751, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.71, + "grad_norm": 0.21943873167037964, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7052, + "grad_norm": 0.2690284848213196, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7058, + "grad_norm": 0.1355724036693573, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7003, + "grad_norm": 0.13730323314666748, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455173632, + "loss": 0.7081, + "grad_norm": 0.27153047919273376, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7016, + "grad_norm": 0.1978193074464798, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6924, + "grad_norm": 0.12134377658367157, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7176, + "grad_norm": 0.4499070644378662, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7077, + "grad_norm": 0.10800253599882126, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7052, + "grad_norm": 0.1284097284078598, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6904, + "grad_norm": 0.27395737171173096, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6931, + "grad_norm": 0.11831894516944885, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6932, + "grad_norm": 0.15581564605236053, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6937, + "grad_norm": 0.3341188430786133, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7053, + "grad_norm": 0.11574015766382217, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7028, + "grad_norm": 0.11749006807804108, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455173632, + "loss": 0.6989, + "grad_norm": 0.1204805076122284, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.693, + "grad_norm": 0.17649832367897034, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7039, + "grad_norm": 0.18270865082740784, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.689, + "grad_norm": 0.11535566300153732, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7104, + "grad_norm": 0.12271557748317719, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7134, + "grad_norm": 0.11121604591608047, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7047, + "grad_norm": 0.18222512304782867, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6926, + "grad_norm": 0.1515274941921234, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6954, + "grad_norm": 0.36331021785736084, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6977, + "grad_norm": 0.07465390115976334, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6996, + "grad_norm": 0.24394814670085907, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6966, + "grad_norm": 0.17391850054264069, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6853, + "grad_norm": 0.09583298116922379, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7067, + "grad_norm": 0.15124133229255676, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7138, + "grad_norm": 0.2401583045721054, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6904, + "grad_norm": 0.08491721004247665, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6776, + "grad_norm": 0.11177503317594528, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6918, + "grad_norm": 0.11581003665924072, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7048, + "grad_norm": 0.19036953151226044, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7092, + "grad_norm": 0.16183516383171082, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6978, + "grad_norm": 0.1216326653957367, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6852, + "grad_norm": 0.10252995789051056, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.704, + "grad_norm": 0.23037372529506683, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6981, + "grad_norm": 0.09529856592416763, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6871, + "grad_norm": 0.1566941738128662, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6878, + "grad_norm": 0.1572422981262207, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.701, + "grad_norm": 0.14229758083820343, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6963, + "grad_norm": 0.06774759292602539, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6858, + "grad_norm": 0.13160522282123566, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6993, + "grad_norm": 0.10639344155788422, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6988, + "grad_norm": 0.2339792400598526, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.698, + "grad_norm": 0.13326497375965118, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6804, + "grad_norm": 0.5780956149101257, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455192064, + "loss": 0.6971, + "grad_norm": 0.1314663141965866, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6955, + "grad_norm": 0.11078652739524841, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7019, + "grad_norm": 0.18069936335086823, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7141, + "grad_norm": 0.2529050409793854, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7142, + "grad_norm": 0.2632238268852234, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7408, + "grad_norm": 0.4549804925918579, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7226, + "grad_norm": 0.38999685645103455, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7235, + "grad_norm": 0.3067549467086792, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6955, + "grad_norm": 0.08506763726472855, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7025, + "grad_norm": 0.11627508699893951, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7097, + "grad_norm": 0.1311037838459015, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7044, + "grad_norm": 0.18991442024707794, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7206, + "grad_norm": 0.3345896899700165, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.691, + "grad_norm": 0.09144661575555801, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7293, + "grad_norm": 0.27406060695648193, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455173632, + "loss": 0.7297, + "grad_norm": 0.2647015154361725, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7041, + "grad_norm": 0.15704335272312164, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6931, + "grad_norm": 0.0719040259718895, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7035, + "grad_norm": 0.15641671419143677, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7121, + "grad_norm": 0.14803701639175415, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6893, + "grad_norm": 0.07192156463861465, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6958, + "grad_norm": 0.09501781314611435, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6979, + "grad_norm": 0.08280641585588455, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455196672, + "loss": 0.6994, + "grad_norm": 0.3023011088371277, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6921, + "grad_norm": 0.19089466333389282, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6794, + "grad_norm": 0.1461048126220703, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.692, + "grad_norm": 0.12304554879665375, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6915, + "grad_norm": 0.07031626999378204, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7005, + "grad_norm": 0.16209903359413147, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6817, + "grad_norm": 0.15300852060317993, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.684, + "grad_norm": 0.08508949726819992, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7024, + "grad_norm": 0.21480239927768707, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6822, + "grad_norm": 0.0867135301232338, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6788, + "grad_norm": 0.08089124411344528, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7265, + "grad_norm": 0.39511507749557495, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7048, + "grad_norm": 0.209834486246109, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.709, + "grad_norm": 0.22699134051799774, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6837, + "grad_norm": 0.23797856271266937, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6901, + "grad_norm": 0.0798911526799202, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6972, + "grad_norm": 0.07681240141391754, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6962, + "grad_norm": 0.07772725075483322, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6942, + "grad_norm": 0.18942886590957642, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6902, + "grad_norm": 0.08737677335739136, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7002, + "grad_norm": 0.11227729916572571, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7072, + "grad_norm": 0.2610335946083069, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6914, + "grad_norm": 0.34792855381965637, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6993, + "grad_norm": 0.09217633306980133, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6933, + "grad_norm": 0.08132340013980865, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6952, + "grad_norm": 0.0874810665845871, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7, + "grad_norm": 0.09196976572275162, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6934, + "grad_norm": 0.16784431040287018, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7058, + "grad_norm": 0.19436070322990417, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.711, + "grad_norm": 0.18295979499816895, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6984, + "grad_norm": 0.19882185757160187, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6852, + "grad_norm": 0.08433717489242554, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7033, + "grad_norm": 0.15989674627780914, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6899, + "grad_norm": 0.08103392273187637, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.704, + "grad_norm": 0.2471395581960678, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6995, + "grad_norm": 0.27907517552375793, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6991, + "grad_norm": 0.09603721648454666, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6898, + "grad_norm": 0.233570396900177, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6959, + "grad_norm": 0.08418440818786621, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6965, + "grad_norm": 0.06506829708814621, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6869, + "grad_norm": 0.2401263266801834, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7007, + "grad_norm": 0.11142557114362717, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6934, + "grad_norm": 0.12641632556915283, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6964, + "grad_norm": 0.0973016694188118, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7037, + "grad_norm": 0.2524285316467285, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7037, + "grad_norm": 0.19421261548995972, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6967, + "grad_norm": 0.11203791946172714, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6905, + "grad_norm": 0.08030221611261368, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6934, + "grad_norm": 0.13075901567935944, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6905, + "grad_norm": 0.09915310889482498, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6915, + "grad_norm": 0.21143624186515808, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6965, + "grad_norm": 0.14313869178295135, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7078, + "grad_norm": 0.15866082906723022, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455192064, + "loss": 0.6839, + "grad_norm": 0.18565437197685242, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7032, + "grad_norm": 0.21441660821437836, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6974, + "grad_norm": 0.20014037191867828, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.7098, + "grad_norm": 0.3224134147167206, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7156, + "grad_norm": 0.4705771207809448, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6964, + "grad_norm": 0.11511879414319992, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6974, + "grad_norm": 0.07066799700260162, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6847, + "grad_norm": 0.26175642013549805, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6816, + "grad_norm": 0.13679614663124084, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.7196, + "grad_norm": 0.32985344529151917, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7139, + "grad_norm": 0.230184406042099, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7074, + "grad_norm": 0.25706344842910767, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6974, + "grad_norm": 0.0733429342508316, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.704, + "grad_norm": 0.17531318962574005, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6851, + "grad_norm": 0.11206116527318954, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7003, + "grad_norm": 0.21140798926353455, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6997, + "grad_norm": 0.1486646980047226, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6991, + "grad_norm": 0.14675626158714294, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6909, + "grad_norm": 0.18790899217128754, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7039, + "grad_norm": 0.07258997112512589, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7007, + "grad_norm": 0.08673511445522308, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.684, + "grad_norm": 0.08997368067502975, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6994, + "grad_norm": 0.17153172194957733, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.696, + "grad_norm": 0.08975733071565628, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.705, + "grad_norm": 0.23946084082126617, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.687, + "grad_norm": 0.16325004398822784, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6888, + "grad_norm": 0.059710755944252014, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7184, + "grad_norm": 0.3737087845802307, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6891, + "grad_norm": 0.09628027677536011, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455192064, + "loss": 0.6952, + "grad_norm": 0.08075688034296036, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6939, + "grad_norm": 0.09955614805221558, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7028, + "grad_norm": 0.08779430389404297, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6882, + "grad_norm": 0.12556606531143188, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6911, + "grad_norm": 0.11146517843008041, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6985, + "grad_norm": 0.24154888093471527, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6911, + "grad_norm": 0.10853473842144012, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6977, + "grad_norm": 0.21161895990371704, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6928, + "grad_norm": 0.09245482832193375, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6838, + "grad_norm": 0.17897014319896698, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7024, + "grad_norm": 0.1865539699792862, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6784, + "grad_norm": 0.13563524186611176, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7079, + "grad_norm": 0.20571720600128174, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7012, + "grad_norm": 0.16555127501487732, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6889, + "grad_norm": 0.15500560402870178, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.686, + "grad_norm": 0.10064449906349182, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.705, + "grad_norm": 0.2203359305858612, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6973, + "grad_norm": 0.10707780718803406, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6982, + "grad_norm": 0.17813244462013245, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6953, + "grad_norm": 0.09353701025247574, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7078, + "grad_norm": 0.3339330852031708, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7056, + "grad_norm": 0.15303492546081543, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.698, + "grad_norm": 0.0985996425151825, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.694, + "grad_norm": 0.29975447058677673, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.697, + "grad_norm": 0.07534032315015793, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6987, + "grad_norm": 0.05975080654025078, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7021, + "grad_norm": 0.10094958543777466, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6879, + "grad_norm": 0.08386775851249695, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6939, + "grad_norm": 0.08353454619646072, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455172096, + "loss": 0.694, + "grad_norm": 0.07752493768930435, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7085, + "grad_norm": 0.2584010362625122, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455173632, + "loss": 0.7166, + "grad_norm": 0.3778802752494812, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6932, + "grad_norm": 0.0914960727095604, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6897, + "grad_norm": 0.16070522367954254, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6975, + "grad_norm": 0.07239442318677902, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6966, + "grad_norm": 0.08239169418811798, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7024, + "grad_norm": 0.12518537044525146, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7027, + "grad_norm": 0.26605284214019775, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.692, + "grad_norm": 0.08482640236616135, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6905, + "grad_norm": 0.07606563717126846, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455192064, + "loss": 0.6964, + "grad_norm": 0.11519191414117813, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6971, + "grad_norm": 0.14751791954040527, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6983, + "grad_norm": 0.15296456217765808, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6909, + "grad_norm": 0.09257344156503677, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.694, + "grad_norm": 0.15957510471343994, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6982, + "grad_norm": 0.1326066106557846, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7028, + "grad_norm": 0.24188445508480072, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6911, + "grad_norm": 0.18427728116512299, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6922, + "grad_norm": 0.1143706664443016, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6994, + "grad_norm": 0.12765102088451385, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6856, + "grad_norm": 0.10082211345434189, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6845, + "grad_norm": 0.08444719761610031, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6888, + "grad_norm": 0.07316906005144119, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7012, + "grad_norm": 0.24288679659366608, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6969, + "grad_norm": 0.2005239576101303, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6949, + "grad_norm": 0.15211275219917297, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6962, + "grad_norm": 0.20403210818767548, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7019, + "grad_norm": 0.24791496992111206, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6878, + "grad_norm": 0.39050108194351196, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6925, + "grad_norm": 0.0835358053445816, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6947, + "grad_norm": 0.09309510141611099, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455192064, + "loss": 0.7072, + "grad_norm": 0.27313050627708435, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6985, + "grad_norm": 0.16274261474609375, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.7083, + "grad_norm": 0.1733526885509491, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6818, + "grad_norm": 0.04668434336781502, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7065, + "grad_norm": 0.18328580260276794, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7162, + "grad_norm": 0.3091479241847992, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6976, + "grad_norm": 0.152513325214386, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6955, + "grad_norm": 0.10982862859964371, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7003, + "grad_norm": 0.3202073872089386, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7082, + "grad_norm": 0.2476348578929901, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6977, + "grad_norm": 0.08463730663061142, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6915, + "grad_norm": 0.07370319962501526, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6838, + "grad_norm": 0.21592728793621063, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6947, + "grad_norm": 0.1911967694759369, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7031, + "grad_norm": 0.259088397026062, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6999, + "grad_norm": 0.1672459989786148, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7041, + "grad_norm": 0.13994230329990387, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6826, + "grad_norm": 0.06330393999814987, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6813, + "grad_norm": 0.07750769704580307, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6936, + "grad_norm": 0.13962404429912567, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.69, + "grad_norm": 0.08350162208080292, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6895, + "grad_norm": 0.0705256536602974, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6974, + "grad_norm": 0.11978083103895187, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7099, + "grad_norm": 0.2585708200931549, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7081, + "grad_norm": 0.2275383323431015, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517056, + "loss": 0.6961, + "grad_norm": 0.09052446484565735, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6876, + "grad_norm": 0.09005367755889893, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6942, + "grad_norm": 0.16654524207115173, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6955, + "grad_norm": 0.06814560294151306, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7012, + "grad_norm": 0.12113191187381744, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6934, + "grad_norm": 0.5656611919403076, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6931, + "grad_norm": 0.14030271768569946, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7016, + "grad_norm": 0.16414351761341095, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7044, + "grad_norm": 0.19135743379592896, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6867, + "grad_norm": 0.1488504558801651, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6993, + "grad_norm": 0.10839702934026718, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7041, + "grad_norm": 0.20591729879379272, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6908, + "grad_norm": 0.07144546508789062, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7126, + "grad_norm": 0.3330305516719818, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6927, + "grad_norm": 0.15284813940525055, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6986, + "grad_norm": 0.1115977019071579, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6818, + "grad_norm": 0.13879795372486115, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6937, + "grad_norm": 0.05534493178129196, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.694, + "grad_norm": 0.25535982847213745, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.696, + "grad_norm": 0.20018376410007477, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6933, + "grad_norm": 0.17994064092636108, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6945, + "grad_norm": 0.07826676964759827, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6943, + "grad_norm": 0.08686373382806778, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7042, + "grad_norm": 0.12325959652662277, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6916, + "grad_norm": 0.07794619351625443, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.4551936, + "loss": 0.6854, + "grad_norm": 0.17743749916553497, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7038, + "grad_norm": 0.18407177925109863, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7062, + "grad_norm": 0.22297176718711853, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.4551936, + "loss": 0.7069, + "grad_norm": 0.18009164929389954, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6941, + "grad_norm": 0.21983860433101654, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6875, + "grad_norm": 0.07862849533557892, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7021, + "grad_norm": 0.2796236574649811, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6875, + "grad_norm": 0.08359945565462112, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7001, + "grad_norm": 0.19457697868347168, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7033, + "grad_norm": 0.18059179186820984, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.701, + "grad_norm": 0.10024019330739975, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6907, + "grad_norm": 0.2073695808649063, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6998, + "grad_norm": 0.07620830833911896, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6908, + "grad_norm": 0.12943734228610992, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7032, + "grad_norm": 0.11993212252855301, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6873, + "grad_norm": 0.060181960463523865, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7037, + "grad_norm": 0.18327173590660095, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7002, + "grad_norm": 0.10324206203222275, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.692, + "grad_norm": 0.07935202866792679, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7014, + "grad_norm": 0.16100293397903442, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6921, + "grad_norm": 0.05711306631565094, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7027, + "grad_norm": 0.3956167697906494, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6987, + "grad_norm": 0.11201073229312897, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6914, + "grad_norm": 0.08603334426879883, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.7013, + "grad_norm": 0.07200030982494354, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6928, + "grad_norm": 0.15762853622436523, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7019, + "grad_norm": 0.14914177358150482, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6955, + "grad_norm": 0.2088334858417511, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6946, + "grad_norm": 0.05736750364303589, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6959, + "grad_norm": 0.057242635637521744, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7053, + "grad_norm": 0.18231870234012604, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6879, + "grad_norm": 0.06622155755758286, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6765, + "grad_norm": 0.0694432407617569, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6907, + "grad_norm": 0.06541617214679718, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6848, + "grad_norm": 0.058477673679590225, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6927, + "grad_norm": 0.06724480539560318, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6907, + "grad_norm": 0.11101619154214859, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.7011, + "grad_norm": 0.21792396903038025, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6914, + "grad_norm": 0.06248336285352707, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6915, + "grad_norm": 0.07560212910175323, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6996, + "grad_norm": 0.11243976652622223, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6913, + "grad_norm": 0.13241273164749146, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7077, + "grad_norm": 0.23406954109668732, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7093, + "grad_norm": 0.22261063754558563, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6907, + "grad_norm": 0.08120368421077728, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7089, + "grad_norm": 0.26472389698028564, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6886, + "grad_norm": 0.07038640975952148, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6952, + "grad_norm": 0.11296079307794571, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6838, + "grad_norm": 0.1615072637796402, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6993, + "grad_norm": 0.2985384464263916, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7013, + "grad_norm": 0.06519924849271774, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6916, + "grad_norm": 0.09150423109531403, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6976, + "grad_norm": 0.16585230827331543, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6973, + "grad_norm": 0.05313456803560257, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.7042, + "grad_norm": 0.2472110390663147, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6927, + "grad_norm": 0.07621602714061737, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6945, + "grad_norm": 0.07932491600513458, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6846, + "grad_norm": 0.1726214587688446, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.7051, + "grad_norm": 0.07434499263763428, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6918, + "grad_norm": 0.1066179871559143, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6991, + "grad_norm": 0.10828086733818054, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6939, + "grad_norm": 0.07230306416749954, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6946, + "grad_norm": 0.08198399841785431, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7052, + "grad_norm": 0.11160536110401154, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7018, + "grad_norm": 0.07604114711284637, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6942, + "grad_norm": 0.07126818597316742, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6825, + "grad_norm": 0.31242313981056213, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.694, + "grad_norm": 0.06899519264698029, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6949, + "grad_norm": 0.09090490639209747, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6926, + "grad_norm": 0.10686199367046356, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6897, + "grad_norm": 0.1054801344871521, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6938, + "grad_norm": 0.06963946670293808, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6874, + "grad_norm": 0.1387190967798233, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6918, + "grad_norm": 0.09965158253908157, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6859, + "grad_norm": 0.08279958367347717, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6888, + "grad_norm": 0.09951020777225494, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7047, + "grad_norm": 0.17851340770721436, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6955, + "grad_norm": 0.09609993547201157, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6758, + "grad_norm": 0.2418796569108963, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6763, + "grad_norm": 0.16268447041511536, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6993, + "grad_norm": 0.07034964859485626, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7039, + "grad_norm": 0.23124106228351593, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7028, + "grad_norm": 0.20626300573349, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7022, + "grad_norm": 0.1670614778995514, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7108, + "grad_norm": 0.292824923992157, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7078, + "grad_norm": 0.12268426269292831, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7023, + "grad_norm": 0.1954732984304428, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6804, + "grad_norm": 0.29552173614501953, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7113, + "grad_norm": 0.34065765142440796, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7026, + "grad_norm": 0.1295490860939026, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6857, + "grad_norm": 0.06976092606782913, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.701, + "grad_norm": 0.28560230135917664, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6976, + "grad_norm": 0.07466460764408112, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6961, + "grad_norm": 0.12852300703525543, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6924, + "grad_norm": 0.06884022802114487, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6957, + "grad_norm": 0.1759393811225891, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6952, + "grad_norm": 0.0877147912979126, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6928, + "grad_norm": 0.24687296152114868, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6952, + "grad_norm": 0.134057879447937, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6869, + "grad_norm": 0.07336528599262238, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6967, + "grad_norm": 0.11016734689474106, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.692, + "grad_norm": 0.06456903368234634, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.698, + "grad_norm": 0.06256292760372162, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.693, + "grad_norm": 0.09560126811265945, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6903, + "grad_norm": 0.09532135725021362, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6931, + "grad_norm": 0.11039506644010544, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6936, + "grad_norm": 0.1081896647810936, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6849, + "grad_norm": 0.11472479999065399, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6916, + "grad_norm": 0.1512822061777115, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7069, + "grad_norm": 0.2144189029932022, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6976, + "grad_norm": 0.06340522319078445, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6949, + "grad_norm": 0.1868722289800644, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6841, + "grad_norm": 0.10565058887004852, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.7024, + "grad_norm": 0.14132261276245117, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.7007, + "grad_norm": 0.24679797887802124, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6883, + "grad_norm": 0.18627075850963593, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6959, + "grad_norm": 0.15773208439350128, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.687, + "grad_norm": 0.16108962893486023, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6905, + "grad_norm": 0.11753600090742111, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6925, + "grad_norm": 0.21946026384830475, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.694, + "grad_norm": 0.13158801198005676, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.683, + "grad_norm": 0.26569944620132446, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6977, + "grad_norm": 0.09167147427797318, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6997, + "grad_norm": 0.2181912064552307, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6985, + "grad_norm": 0.28222906589508057, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6912, + "grad_norm": 0.08191527426242828, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6939, + "grad_norm": 0.1249212771654129, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6955, + "grad_norm": 0.061766769737005234, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6935, + "grad_norm": 0.1302478015422821, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6938, + "grad_norm": 0.16663610935211182, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6973, + "grad_norm": 0.06449966877698898, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6965, + "grad_norm": 0.2272346466779709, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6893, + "grad_norm": 0.07192134857177734, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6955, + "grad_norm": 0.06829710304737091, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6893, + "grad_norm": 0.1561611294746399, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.696, + "grad_norm": 0.18520012497901917, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6973, + "grad_norm": 0.13032078742980957, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.694, + "grad_norm": 0.08913850784301758, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6976, + "grad_norm": 0.07432191073894501, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6961, + "grad_norm": 0.1300523281097412, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6941, + "grad_norm": 0.1699293702840805, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6984, + "grad_norm": 0.19230447709560394, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6943, + "grad_norm": 0.16619108617305756, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6928, + "grad_norm": 0.3183883726596832, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6804, + "grad_norm": 0.3819268047809601, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6939, + "grad_norm": 0.06214940920472145, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.689, + "grad_norm": 0.16725780069828033, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6953, + "grad_norm": 0.059534598141908646, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6877, + "grad_norm": 0.16333073377609253, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6948, + "grad_norm": 0.06167379021644592, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6843, + "grad_norm": 0.06552042812108994, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6969, + "grad_norm": 0.21957165002822876, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.684, + "grad_norm": 0.08547736704349518, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6988, + "grad_norm": 0.08907903730869293, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6873, + "grad_norm": 0.10605067759752274, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6905, + "grad_norm": 0.21940283477306366, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6887, + "grad_norm": 0.1911245584487915, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455195136, + "loss": 0.6997, + "grad_norm": 0.2600195109844208, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6987, + "grad_norm": 0.2501670718193054, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6912, + "grad_norm": 0.10871375352144241, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6967, + "grad_norm": 0.08777184039354324, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.698, + "grad_norm": 0.08171583712100983, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6981, + "grad_norm": 0.11075262725353241, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6871, + "grad_norm": 0.12110795825719833, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6891, + "grad_norm": 0.13659481704235077, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6951, + "grad_norm": 0.07900399714708328, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6959, + "grad_norm": 0.2359241098165512, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6887, + "grad_norm": 0.06526245176792145, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6931, + "grad_norm": 0.13984207808971405, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.694, + "grad_norm": 0.12612609565258026, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6898, + "grad_norm": 0.06562681496143341, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7042, + "grad_norm": 0.05462134629487991, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.689, + "grad_norm": 0.048989612609148026, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6924, + "grad_norm": 0.15155662596225739, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6903, + "grad_norm": 0.09525495022535324, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6978, + "grad_norm": 0.11176580935716629, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455192064, + "loss": 0.6908, + "grad_norm": 0.11443508416414261, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6977, + "grad_norm": 0.11713071912527084, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6918, + "grad_norm": 0.10448423027992249, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6925, + "grad_norm": 0.10798266530036926, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6921, + "grad_norm": 0.2188544124364853, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6981, + "grad_norm": 0.4251951575279236, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6922, + "grad_norm": 0.09044427424669266, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6955, + "grad_norm": 0.16407081484794617, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6969, + "grad_norm": 0.13608789443969727, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.697, + "grad_norm": 0.14408469200134277, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6959, + "grad_norm": 0.11263404041528702, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6926, + "grad_norm": 0.09657713025808334, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7039, + "grad_norm": 0.23388832807540894, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6884, + "grad_norm": 0.08472965657711029, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6926, + "grad_norm": 0.11290962249040604, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6922, + "grad_norm": 0.06372485309839249, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6974, + "grad_norm": 0.28268682956695557, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.7014, + "grad_norm": 0.16941028833389282, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7004, + "grad_norm": 0.15467463433742523, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.7016, + "grad_norm": 0.21076899766921997, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6981, + "grad_norm": 0.09159217774868011, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6925, + "grad_norm": 0.08747881650924683, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6875, + "grad_norm": 0.2122151404619217, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6987, + "grad_norm": 0.11967389285564423, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6931, + "grad_norm": 0.14777332544326782, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6926, + "grad_norm": 0.07160502672195435, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6992, + "grad_norm": 0.13870233297348022, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6938, + "grad_norm": 0.08043424785137177, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6946, + "grad_norm": 0.08527924120426178, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6933, + "grad_norm": 0.0725308284163475, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6857, + "grad_norm": 0.22906231880187988, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6905, + "grad_norm": 0.10136181861162186, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6919, + "grad_norm": 0.06638987362384796, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6919, + "grad_norm": 0.07989633083343506, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6883, + "grad_norm": 0.0515093095600605, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6926, + "grad_norm": 0.10200107842683792, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.7009, + "grad_norm": 0.2442573606967926, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455190528, + "loss": 0.6906, + "grad_norm": 0.09759146720170975, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6893, + "grad_norm": 0.20615549385547638, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6977, + "grad_norm": 0.14104941487312317, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6982, + "grad_norm": 0.21131962537765503, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6928, + "grad_norm": 0.3110273778438568, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.7009, + "grad_norm": 0.18059374392032623, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6995, + "grad_norm": 0.14213891327381134, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6978, + "grad_norm": 0.32240593433380127, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455188992, + "loss": 0.6894, + "grad_norm": 0.07744120061397552, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6907, + "grad_norm": 0.17349517345428467, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6914, + "grad_norm": 0.10278797149658203, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6942, + "grad_norm": 0.19197827577590942, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6867, + "grad_norm": 0.1089465394616127, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6933, + "grad_norm": 0.10436882823705673, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6925, + "grad_norm": 0.08382229506969452, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6918, + "grad_norm": 0.09709219634532928, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455187456, + "loss": 0.6916, + "grad_norm": 0.16083228588104248, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6985, + "grad_norm": 0.0870104506611824, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455176704, + "loss": 0.6986, + "grad_norm": 0.26743918657302856, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6924, + "grad_norm": 0.1380254328250885, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6917, + "grad_norm": 0.09760503470897675, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6918, + "grad_norm": 0.17074893414974213, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6923, + "grad_norm": 0.22369350492954254, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6953, + "grad_norm": 0.06888538599014282, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.688, + "grad_norm": 0.1524192839860916, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.691, + "grad_norm": 0.10887207835912704, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6977, + "grad_norm": 0.06318213045597076, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6919, + "grad_norm": 0.1335323303937912, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6914, + "grad_norm": 0.1390407383441925, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6965, + "grad_norm": 0.06897924840450287, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6936, + "grad_norm": 0.14121748507022858, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.69, + "grad_norm": 0.05877530202269554, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.695, + "grad_norm": 0.06572108715772629, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.692, + "grad_norm": 0.17487938702106476, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6999, + "grad_norm": 0.24011503159999847, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.7027, + "grad_norm": 0.10759444534778595, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6903, + "grad_norm": 0.19380654394626617, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6952, + "grad_norm": 0.11310966312885284, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6888, + "grad_norm": 0.08393093198537827, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6932, + "grad_norm": 0.056399255990982056, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6924, + "grad_norm": 0.09899687767028809, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6932, + "grad_norm": 0.09507197886705399, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455179776, + "loss": 0.6901, + "grad_norm": 0.1366751790046692, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45517824, + "loss": 0.6942, + "grad_norm": 0.16891713440418243, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455192064, + "loss": 0.6856, + "grad_norm": 0.15213067829608917, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.6886, + "grad_norm": 0.08033210039138794, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6983, + "grad_norm": 0.07573483139276505, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455182848, + "loss": 0.693, + "grad_norm": 0.1606021523475647, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6934, + "grad_norm": 0.07979687303304672, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455175168, + "loss": 0.6936, + "grad_norm": 0.09796908497810364, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.854390272, + "gpu_mem": 4.45518592, + "loss": 0.6999, + "grad_norm": 0.27466443181037903, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6995, + "grad_norm": 0.1675737053155899, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455184384, + "loss": 0.6964, + "grad_norm": 0.14018340408802032, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "loss": 0.6949, + "grad_norm": 0.12293223291635513, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.854390272, + "gpu_mem": 4.455181312, + "train_runtime": 2933.0392, + "train_samples_per_second": 13.957, + "train_steps_per_second": 0.218, + "total_flos": 1.4601807792635904e+16, + "train_loss": 0.803875460010022 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef1d724eca7640a4f365c193cda2fc4efdb2073 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1b5057b466e6f7c0356c9baf91a4c69a5eaa50fd --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5240726124704025 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..ac006fe2196db3fba026d44cdfd82719e3b152a8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 50462720 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-winogrande-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2", + "seed": 42, + "timestamp": "2025-09-14T04:47:55.894872" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..20a8e9092c772d5cab2ba02d8ac4e27e93bb27dc --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r32-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.854255104, + "gpu_mem": 4.619185664, + "loss": 3.3802, + "grad_norm": 3.3420424461364746, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.859956736, + "gpu_mem": 5.022885376, + "loss": 3.3361, + "grad_norm": 3.299823522567749, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.860349952, + "gpu_mem": 5.022889984, + "loss": 3.2239, + "grad_norm": 3.114288330078125, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.860743168, + "gpu_mem": 5.022888448, + "loss": 3.176, + "grad_norm": 3.2095937728881836, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.861136384, + "gpu_mem": 5.022888448, + "loss": 3.1736, + "grad_norm": 3.1669957637786865, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.861332992, + "gpu_mem": 5.022894592, + "loss": 3.1389, + "grad_norm": 3.20884370803833, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.861726208, + "gpu_mem": 5.022900736, + "loss": 3.0071, + "grad_norm": 3.262693166732788, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.861922816, + "gpu_mem": 5.02288384, + "loss": 2.9123, + "grad_norm": 3.0326783657073975, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.862119424, + "gpu_mem": 5.022889984, + "loss": 3.0282, + "grad_norm": 3.3070523738861084, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.862316032, + "gpu_mem": 5.022893056, + "loss": 2.7598, + "grad_norm": 3.247817039489746, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.86251264, + "gpu_mem": 5.022882304, + "loss": 2.6134, + "grad_norm": 3.1692821979522705, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.862709248, + "gpu_mem": 5.022886912, + "loss": 2.6313, + "grad_norm": 3.226085662841797, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.862905856, + "gpu_mem": 5.022894592, + "loss": 2.3166, + "grad_norm": 2.904945135116577, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.862905856, + "gpu_mem": 5.022889984, + "loss": 2.1569, + "grad_norm": 2.7390832901000977, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.863102464, + "gpu_mem": 5.022889984, + "loss": 1.9102, + "grad_norm": 2.5223379135131836, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.863102464, + "gpu_mem": 5.022886912, + "loss": 1.8025, + "grad_norm": 2.456543445587158, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.863102464, + "gpu_mem": 5.022886912, + "loss": 1.5567, + "grad_norm": 2.101299524307251, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.863299072, + "gpu_mem": 5.022889984, + "loss": 1.4896, + "grad_norm": 1.9655094146728516, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.86349568, + "gpu_mem": 5.022886912, + "loss": 1.3676, + "grad_norm": 1.7930899858474731, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.86349568, + "gpu_mem": 5.022894592, + "loss": 1.3707, + "grad_norm": 1.7426024675369263, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.863692288, + "gpu_mem": 5.022886912, + "loss": 1.1956, + "grad_norm": 1.2095270156860352, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.863692288, + "gpu_mem": 5.022886912, + "loss": 1.0696, + "grad_norm": 0.9372053146362305, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.863692288, + "gpu_mem": 5.022882304, + "loss": 1.0949, + "grad_norm": 0.9563419222831726, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.863692288, + "gpu_mem": 5.022885376, + "loss": 0.9875, + "grad_norm": 0.7014867663383484, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.863888896, + "gpu_mem": 5.022888448, + "loss": 0.8992, + "grad_norm": 0.6170854568481445, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.863888896, + "gpu_mem": 5.02288384, + "loss": 0.776, + "grad_norm": 0.44696879386901855, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.863888896, + "gpu_mem": 5.022882304, + "loss": 0.8439, + "grad_norm": 0.4866417348384857, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.863888896, + "gpu_mem": 5.022888448, + "loss": 0.7959, + "grad_norm": 0.4859016239643097, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.863888896, + "gpu_mem": 5.022886912, + "loss": 0.7841, + "grad_norm": 0.430417537689209, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.863888896, + "gpu_mem": 5.022886912, + "loss": 0.7606, + "grad_norm": 0.19597308337688446, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.863888896, + "gpu_mem": 5.022886912, + "loss": 0.7203, + "grad_norm": 0.34330689907073975, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.864085504, + "gpu_mem": 5.02288384, + "loss": 0.6987, + "grad_norm": 0.32493478059768677, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.864085504, + "gpu_mem": 5.02288384, + "loss": 0.7863, + "grad_norm": 0.8785906434059143, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.864085504, + "gpu_mem": 5.02288384, + "loss": 0.7145, + "grad_norm": 0.5065202116966248, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.864085504, + "gpu_mem": 5.022889984, + "loss": 0.6874, + "grad_norm": 0.2989569306373596, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.864085504, + "gpu_mem": 5.022885376, + "loss": 0.7594, + "grad_norm": 0.6584317684173584, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.864085504, + "gpu_mem": 5.02288384, + "loss": 0.7058, + "grad_norm": 0.1865990310907364, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.864085504, + "gpu_mem": 5.022888448, + "loss": 0.7291, + "grad_norm": 0.31094425916671753, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.864085504, + "gpu_mem": 5.022894592, + "loss": 0.7272, + "grad_norm": 0.4919778108596802, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.864085504, + "gpu_mem": 5.02289152, + "loss": 0.7212, + "grad_norm": 0.2723500728607178, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.864085504, + "gpu_mem": 5.02289152, + "loss": 0.7205, + "grad_norm": 0.17865227162837982, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.864085504, + "gpu_mem": 5.022888448, + "loss": 0.7391, + "grad_norm": 0.6612763404846191, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.864085504, + "gpu_mem": 5.022888448, + "loss": 0.6976, + "grad_norm": 0.13962002098560333, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.864282112, + "gpu_mem": 5.022888448, + "loss": 0.7432, + "grad_norm": 0.45537176728248596, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.864282112, + "gpu_mem": 5.022896128, + "loss": 0.7114, + "grad_norm": 0.2570909857749939, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.864282112, + "gpu_mem": 5.022888448, + "loss": 0.7363, + "grad_norm": 0.15641433000564575, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.864282112, + "gpu_mem": 5.022889984, + "loss": 0.7493, + "grad_norm": 0.30653199553489685, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.864282112, + "gpu_mem": 5.022889984, + "loss": 0.7532, + "grad_norm": 0.4339975416660309, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.864282112, + "gpu_mem": 5.022880768, + "loss": 0.6645, + "grad_norm": 0.19253499805927277, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.864282112, + "gpu_mem": 5.02288384, + "loss": 0.7296, + "grad_norm": 0.22468185424804688, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.7051, + "grad_norm": 0.11638680100440979, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7019, + "grad_norm": 0.17107371985912323, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7085, + "grad_norm": 0.23418158292770386, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022877696, + "loss": 0.7114, + "grad_norm": 0.21478185057640076, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6852, + "grad_norm": 0.12560124695301056, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7081, + "grad_norm": 0.11541113257408142, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.714, + "grad_norm": 0.24051599204540253, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.7005, + "grad_norm": 0.130173921585083, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022896128, + "loss": 0.692, + "grad_norm": 0.10410351306200027, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7029, + "grad_norm": 0.1173817366361618, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6886, + "grad_norm": 0.07088523358106613, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7059, + "grad_norm": 0.14073340594768524, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7193, + "grad_norm": 0.25572067499160767, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.7006, + "grad_norm": 0.0783626139163971, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.716, + "grad_norm": 0.188925102353096, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7081, + "grad_norm": 0.11921793967485428, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6898, + "grad_norm": 0.14826375246047974, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7081, + "grad_norm": 0.09918297082185745, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6985, + "grad_norm": 0.10156139731407166, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.713, + "grad_norm": 0.20790845155715942, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6917, + "grad_norm": 0.08901599049568176, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.7025, + "grad_norm": 0.1133250892162323, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7048, + "grad_norm": 0.18714597821235657, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.7024, + "grad_norm": 0.1038452684879303, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7157, + "grad_norm": 0.25926074385643005, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6895, + "grad_norm": 0.16989970207214355, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.7019, + "grad_norm": 0.07173074781894684, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6887, + "grad_norm": 0.06556952744722366, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7012, + "grad_norm": 0.05873585864901543, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.7146, + "grad_norm": 0.06447529792785645, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.6977, + "grad_norm": 0.06431767344474792, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6842, + "grad_norm": 0.08523130416870117, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6918, + "grad_norm": 0.1602739691734314, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.7009, + "grad_norm": 0.13546141982078552, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.704, + "grad_norm": 0.10498923063278198, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.709, + "grad_norm": 0.0734967291355133, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6902, + "grad_norm": 0.08831345289945602, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6905, + "grad_norm": 0.08198382705450058, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6992, + "grad_norm": 0.11903834342956543, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6909, + "grad_norm": 0.12846015393733978, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6892, + "grad_norm": 0.2591746151447296, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7147, + "grad_norm": 0.1962236762046814, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6983, + "grad_norm": 0.1057763621211052, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.704, + "grad_norm": 0.08308777213096619, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6855, + "grad_norm": 0.09676402807235718, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.7016, + "grad_norm": 0.21744176745414734, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6976, + "grad_norm": 0.06532999873161316, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6983, + "grad_norm": 0.05443178117275238, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.7005, + "grad_norm": 0.24574881792068481, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6981, + "grad_norm": 0.09955016523599625, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6947, + "grad_norm": 0.09686021506786346, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.7165, + "grad_norm": 0.21458236873149872, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6905, + "grad_norm": 0.09229875355958939, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7171, + "grad_norm": 0.2037379890680313, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6976, + "grad_norm": 0.10044937580823898, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.696, + "grad_norm": 0.05621151998639107, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6995, + "grad_norm": 0.0603727363049984, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6784, + "grad_norm": 0.06431302428245544, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7143, + "grad_norm": 0.11501528322696686, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.7163, + "grad_norm": 0.21224664151668549, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6997, + "grad_norm": 0.09792075306177139, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7015, + "grad_norm": 0.07499217242002487, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022879232, + "loss": 0.6991, + "grad_norm": 0.16404807567596436, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7004, + "grad_norm": 0.0779486820101738, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.7005, + "grad_norm": 0.10200147330760956, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.747, + "grad_norm": 0.41554805636405945, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.7012, + "grad_norm": 0.08043798804283142, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7039, + "grad_norm": 0.11717724800109863, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6924, + "grad_norm": 0.17235015332698822, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6915, + "grad_norm": 0.04900851845741272, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6929, + "grad_norm": 0.0660431981086731, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.686, + "grad_norm": 0.17863404750823975, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6999, + "grad_norm": 0.052777595818042755, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7053, + "grad_norm": 0.11872716248035431, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022879232, + "loss": 0.7007, + "grad_norm": 0.09753600507974625, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.7018, + "grad_norm": 0.14342103898525238, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7072, + "grad_norm": 0.1190154179930687, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6937, + "grad_norm": 0.05772944167256355, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7069, + "grad_norm": 0.06413938850164413, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.7041, + "grad_norm": 0.04841078445315361, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6972, + "grad_norm": 0.08237235248088837, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.7021, + "grad_norm": 0.15062697231769562, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7098, + "grad_norm": 0.29325351119041443, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6958, + "grad_norm": 0.03880230709910393, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6956, + "grad_norm": 0.17499203979969025, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6848, + "grad_norm": 0.11090856790542603, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6941, + "grad_norm": 0.05287203565239906, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.708, + "grad_norm": 0.1414925754070282, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.7273, + "grad_norm": 0.22708694636821747, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6945, + "grad_norm": 0.0836324542760849, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6883, + "grad_norm": 0.08381729573011398, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6944, + "grad_norm": 0.06279807537794113, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6995, + "grad_norm": 0.08546354621648788, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7013, + "grad_norm": 0.08343635499477386, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.6938, + "grad_norm": 0.07303880900144577, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6841, + "grad_norm": 0.08311878889799118, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7009, + "grad_norm": 0.11103788763284683, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6953, + "grad_norm": 0.06386216729879379, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6873, + "grad_norm": 0.11837296187877655, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6889, + "grad_norm": 0.10817471891641617, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6969, + "grad_norm": 0.09696518629789352, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.695, + "grad_norm": 0.03625301271677017, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.69, + "grad_norm": 0.05557387322187424, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6971, + "grad_norm": 0.08237471431493759, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7034, + "grad_norm": 0.17234545946121216, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.6956, + "grad_norm": 0.07883638143539429, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6833, + "grad_norm": 0.3810880184173584, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022897664, + "loss": 0.6929, + "grad_norm": 0.09279962629079819, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.692, + "grad_norm": 0.05407567694783211, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.7108, + "grad_norm": 0.16341188549995422, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7196, + "grad_norm": 0.1923682540655136, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.7234, + "grad_norm": 0.2010122835636139, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7439, + "grad_norm": 0.31425896286964417, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7185, + "grad_norm": 0.23818273842334747, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.7117, + "grad_norm": 0.16676673293113708, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6941, + "grad_norm": 0.04269300773739815, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7048, + "grad_norm": 0.09783191978931427, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.7116, + "grad_norm": 0.11288964003324509, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7198, + "grad_norm": 0.1468314379453659, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.7427, + "grad_norm": 0.2195584625005722, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6823, + "grad_norm": 0.03983402997255325, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7257, + "grad_norm": 0.1594439446926117, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022879232, + "loss": 0.7268, + "grad_norm": 0.1426292210817337, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7012, + "grad_norm": 0.0811227411031723, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6944, + "grad_norm": 0.04824547469615936, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6941, + "grad_norm": 0.05277474597096443, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7015, + "grad_norm": 0.04714040458202362, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.692, + "grad_norm": 0.07935027033090591, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7065, + "grad_norm": 0.10633562505245209, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7, + "grad_norm": 0.09515109658241272, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022902272, + "loss": 0.7231, + "grad_norm": 0.2410014420747757, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6821, + "grad_norm": 0.09220167249441147, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6831, + "grad_norm": 0.07731679826974869, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6894, + "grad_norm": 0.07815196365118027, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6935, + "grad_norm": 0.04614807665348053, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6977, + "grad_norm": 0.0826205387711525, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6809, + "grad_norm": 0.11772393435239792, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6862, + "grad_norm": 0.03204696998000145, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6997, + "grad_norm": 0.10099214315414429, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6846, + "grad_norm": 0.0707707330584526, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6807, + "grad_norm": 0.06300125271081924, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7162, + "grad_norm": 0.22199025750160217, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.701, + "grad_norm": 0.10196724534034729, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.7029, + "grad_norm": 0.10565154254436493, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6952, + "grad_norm": 0.16150514781475067, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6888, + "grad_norm": 0.05798394978046417, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.6992, + "grad_norm": 0.06103166565299034, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6974, + "grad_norm": 0.05553818866610527, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.692, + "grad_norm": 0.11971419304609299, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6905, + "grad_norm": 0.05079853907227516, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.6996, + "grad_norm": 0.04578455910086632, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7034, + "grad_norm": 0.13986071944236755, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6985, + "grad_norm": 0.22553972899913788, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6963, + "grad_norm": 0.048237189650535583, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.691, + "grad_norm": 0.05531986802816391, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6959, + "grad_norm": 0.05314972624182701, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.7023, + "grad_norm": 0.0542740635573864, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6957, + "grad_norm": 0.10625320672988892, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7062, + "grad_norm": 0.1161557286977768, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.709, + "grad_norm": 0.10773330181837082, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6915, + "grad_norm": 0.10601121187210083, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6924, + "grad_norm": 0.04532993584871292, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7016, + "grad_norm": 0.09951391071081161, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022896128, + "loss": 0.6838, + "grad_norm": 0.046630121767520905, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6975, + "grad_norm": 0.12996941804885864, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7006, + "grad_norm": 0.15828275680541992, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6984, + "grad_norm": 0.05494055524468422, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6902, + "grad_norm": 0.14348571002483368, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6959, + "grad_norm": 0.03573177754878998, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6959, + "grad_norm": 0.05158424749970436, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.682, + "grad_norm": 0.11732923984527588, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7, + "grad_norm": 0.07457715272903442, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.689, + "grad_norm": 0.0507243350148201, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.697, + "grad_norm": 0.06238958612084389, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7059, + "grad_norm": 0.1449166238307953, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7037, + "grad_norm": 0.10551431775093079, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6968, + "grad_norm": 0.05715542286634445, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6951, + "grad_norm": 0.05113060027360916, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6912, + "grad_norm": 0.049700867384672165, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.695, + "grad_norm": 0.0783625990152359, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6853, + "grad_norm": 0.09385474771261215, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6909, + "grad_norm": 0.05511975288391113, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.7025, + "grad_norm": 0.08943234384059906, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022897664, + "loss": 0.6918, + "grad_norm": 0.1152796596288681, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7035, + "grad_norm": 0.12644128501415253, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.697, + "grad_norm": 0.10530192404985428, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022896128, + "loss": 0.7127, + "grad_norm": 0.16442933678627014, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.7079, + "grad_norm": 0.2318514585494995, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6992, + "grad_norm": 0.09004899859428406, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6974, + "grad_norm": 0.047981392592191696, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6747, + "grad_norm": 0.10047528147697449, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6801, + "grad_norm": 0.0517236590385437, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022896128, + "loss": 0.727, + "grad_norm": 0.22050988674163818, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.716, + "grad_norm": 0.13989557325839996, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.7074, + "grad_norm": 0.15132074058055878, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.693, + "grad_norm": 0.028956333175301552, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6973, + "grad_norm": 0.07182438671588898, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6935, + "grad_norm": 0.11719097942113876, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6876, + "grad_norm": 0.07269532978534698, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7137, + "grad_norm": 0.20284482836723328, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7016, + "grad_norm": 0.043480969965457916, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6866, + "grad_norm": 0.07847059518098831, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6978, + "grad_norm": 0.06052326411008835, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6971, + "grad_norm": 0.06935831159353256, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.6931, + "grad_norm": 0.051458921283483505, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6977, + "grad_norm": 0.10713467746973038, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6977, + "grad_norm": 0.043970126658678055, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.696, + "grad_norm": 0.11368314176797867, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.7053, + "grad_norm": 0.15356147289276123, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6963, + "grad_norm": 0.09141013026237488, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6883, + "grad_norm": 0.15950870513916016, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6957, + "grad_norm": 0.04524422809481621, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022897664, + "loss": 0.697, + "grad_norm": 0.04999703913927078, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6939, + "grad_norm": 0.03520284593105316, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7055, + "grad_norm": 0.09533064067363739, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6842, + "grad_norm": 0.0390302836894989, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6897, + "grad_norm": 0.04072475805878639, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7018, + "grad_norm": 0.13950666785240173, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6915, + "grad_norm": 0.06427665799856186, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6955, + "grad_norm": 0.10467036813497543, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6933, + "grad_norm": 0.03249180689454079, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6913, + "grad_norm": 0.12475797533988953, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.695, + "grad_norm": 0.06264282017946243, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.689, + "grad_norm": 0.11228593438863754, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6965, + "grad_norm": 0.07302477210760117, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022882304, + "loss": 0.6949, + "grad_norm": 0.06042073294520378, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.6961, + "grad_norm": 0.11533452570438385, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6844, + "grad_norm": 0.03014272451400757, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6966, + "grad_norm": 0.10111630707979202, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022894592, + "loss": 0.6992, + "grad_norm": 0.0528702512383461, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.6925, + "grad_norm": 0.08304446935653687, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6972, + "grad_norm": 0.05445067957043648, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.7099, + "grad_norm": 0.19669613242149353, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7069, + "grad_norm": 0.08291597664356232, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.695, + "grad_norm": 0.051776986569166183, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6974, + "grad_norm": 0.17675581574440002, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.6958, + "grad_norm": 0.041607409715652466, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6963, + "grad_norm": 0.026071982458233833, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6999, + "grad_norm": 0.0705493837594986, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.689, + "grad_norm": 0.04902677237987518, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6932, + "grad_norm": 0.030513059347867966, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022877696, + "loss": 0.6954, + "grad_norm": 0.03604958951473236, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.7073, + "grad_norm": 0.13755472004413605, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022879232, + "loss": 0.7095, + "grad_norm": 0.19906172156333923, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6923, + "grad_norm": 0.042763277888298035, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.694, + "grad_norm": 0.1086958572268486, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6981, + "grad_norm": 0.03022623248398304, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6957, + "grad_norm": 0.040421757847070694, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.6965, + "grad_norm": 0.05317750200629234, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7023, + "grad_norm": 0.16668933629989624, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6916, + "grad_norm": 0.037892404943704605, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.6933, + "grad_norm": 0.04143216088414192, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022897664, + "loss": 0.6971, + "grad_norm": 0.06078343093395233, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022889984, + "loss": 0.695, + "grad_norm": 0.07365698367357254, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6946, + "grad_norm": 0.07864898443222046, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6903, + "grad_norm": 0.03586885333061218, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7007, + "grad_norm": 0.11660092324018478, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022886912, + "loss": 0.6929, + "grad_norm": 0.051914554089307785, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.7101, + "grad_norm": 0.16689126193523407, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.6884, + "grad_norm": 0.08521142601966858, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.6911, + "grad_norm": 0.05292697995901108, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022880768, + "loss": 0.6984, + "grad_norm": 0.0765916034579277, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.689, + "grad_norm": 0.03988257423043251, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6899, + "grad_norm": 0.03872169181704521, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02289152, + "loss": 0.6905, + "grad_norm": 0.04203181713819504, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022888448, + "loss": 0.7004, + "grad_norm": 0.13491398096084595, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022885376, + "loss": 0.6995, + "grad_norm": 0.10882111638784409, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6965, + "grad_norm": 0.08178029954433441, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.86447872, + "gpu_mem": 5.022893056, + "loss": 0.6904, + "grad_norm": 0.09847918152809143, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.86447872, + "gpu_mem": 5.02288384, + "loss": 0.695, + "grad_norm": 0.12119610607624054, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6734, + "grad_norm": 0.18824538588523865, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6924, + "grad_norm": 0.04843165725469589, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.698, + "grad_norm": 0.0806940495967865, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022897664, + "loss": 0.7218, + "grad_norm": 0.18974600732326508, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.7077, + "grad_norm": 0.1242310106754303, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.712, + "grad_norm": 0.12595714628696442, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6841, + "grad_norm": 0.03597329184412956, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.7106, + "grad_norm": 0.11950599402189255, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.7087, + "grad_norm": 0.17105141282081604, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6937, + "grad_norm": 0.06421990692615509, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6931, + "grad_norm": 0.03734252601861954, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6906, + "grad_norm": 0.1412390023469925, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6902, + "grad_norm": 0.08187586069107056, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6978, + "grad_norm": 0.06964010000228882, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6952, + "grad_norm": 0.08658964931964874, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6742, + "grad_norm": 0.047259069979190826, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.7289, + "grad_norm": 0.22394919395446777, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.7353, + "grad_norm": 0.22947169840335846, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.7124, + "grad_norm": 0.14991749823093414, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.7054, + "grad_norm": 0.09554284811019897, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6877, + "grad_norm": 0.04400734603404999, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.685, + "grad_norm": 0.06018107384443283, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6854, + "grad_norm": 0.034891609102487564, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6992, + "grad_norm": 0.09696390479803085, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6993, + "grad_norm": 0.0874435305595398, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6922, + "grad_norm": 0.0349886380136013, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6878, + "grad_norm": 0.057062115520238876, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6912, + "grad_norm": 0.0517832413315773, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02287616, + "loss": 0.6968, + "grad_norm": 0.058286406099796295, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.691, + "grad_norm": 0.04055704176425934, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6883, + "grad_norm": 0.05200963094830513, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6978, + "grad_norm": 0.0470973402261734, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6951, + "grad_norm": 0.033144380897283554, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6822, + "grad_norm": 0.2718160152435303, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.694, + "grad_norm": 0.0768592581152916, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.699, + "grad_norm": 0.07611589133739471, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6987, + "grad_norm": 0.09550268948078156, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6899, + "grad_norm": 0.07235929369926453, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6963, + "grad_norm": 0.04356210678815842, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6971, + "grad_norm": 0.08369924128055573, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6945, + "grad_norm": 0.024608567357063293, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.699, + "grad_norm": 0.14274239540100098, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6915, + "grad_norm": 0.049584902822971344, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6961, + "grad_norm": 0.0791303738951683, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6925, + "grad_norm": 0.10415507107973099, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6959, + "grad_norm": 0.04386407881975174, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6819, + "grad_norm": 0.09668760746717453, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6883, + "grad_norm": 0.07087373733520508, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.684, + "grad_norm": 0.06411807239055634, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.694, + "grad_norm": 0.0406595915555954, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6982, + "grad_norm": 0.06764250248670578, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.7027, + "grad_norm": 0.08132321387529373, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6954, + "grad_norm": 0.051837220788002014, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.0228992, + "loss": 0.6823, + "grad_norm": 0.07046253234148026, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.7036, + "grad_norm": 0.10005854070186615, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7065, + "grad_norm": 0.11897020041942596, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.0228992, + "loss": 0.7032, + "grad_norm": 0.09192755818367004, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6957, + "grad_norm": 0.10540535300970078, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.694, + "grad_norm": 0.047926437109708786, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7011, + "grad_norm": 0.12681648135185242, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6909, + "grad_norm": 0.041792165488004684, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6923, + "grad_norm": 0.081211619079113, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6929, + "grad_norm": 0.06317201256752014, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.704, + "grad_norm": 0.08113985508680344, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.7082, + "grad_norm": 0.14583829045295715, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.7002, + "grad_norm": 0.06910689920186996, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6842, + "grad_norm": 0.03857361152768135, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7008, + "grad_norm": 0.08800535649061203, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6927, + "grad_norm": 0.044442884624004364, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7045, + "grad_norm": 0.1164683848619461, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6982, + "grad_norm": 0.05904219299554825, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6942, + "grad_norm": 0.04434257745742798, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7038, + "grad_norm": 0.08845175802707672, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.689, + "grad_norm": 0.028469713404774666, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6944, + "grad_norm": 0.18446730077266693, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6971, + "grad_norm": 0.037452854216098785, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.6911, + "grad_norm": 0.0252954363822937, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6971, + "grad_norm": 0.03233769163489342, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7015, + "grad_norm": 0.0981166735291481, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7072, + "grad_norm": 0.09424024075269699, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6813, + "grad_norm": 0.07477609068155289, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6988, + "grad_norm": 0.04571758955717087, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6919, + "grad_norm": 0.03912100940942764, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7104, + "grad_norm": 0.11150365322828293, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6883, + "grad_norm": 0.029354792088270187, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6819, + "grad_norm": 0.032244808971881866, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6893, + "grad_norm": 0.028297701850533485, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.6868, + "grad_norm": 0.028448516502976418, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6881, + "grad_norm": 0.026696354150772095, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6889, + "grad_norm": 0.04416930675506592, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.7071, + "grad_norm": 0.11504721641540527, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6901, + "grad_norm": 0.025724144652485847, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6895, + "grad_norm": 0.03155254200100899, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6973, + "grad_norm": 0.04638355225324631, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.692, + "grad_norm": 0.07104672491550446, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.7022, + "grad_norm": 0.10259129852056503, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7058, + "grad_norm": 0.09780436009168625, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6882, + "grad_norm": 0.043868765234947205, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7008, + "grad_norm": 0.11705806851387024, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6898, + "grad_norm": 0.02953208237886429, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6996, + "grad_norm": 0.06444589048624039, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6956, + "grad_norm": 0.09752791374921799, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6906, + "grad_norm": 0.13153603672981262, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.7008, + "grad_norm": 0.04314275085926056, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6921, + "grad_norm": 0.04944797605276108, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6915, + "grad_norm": 0.06776456534862518, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6963, + "grad_norm": 0.03197818621993065, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.7082, + "grad_norm": 0.14214974641799927, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6949, + "grad_norm": 0.04219641909003258, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6925, + "grad_norm": 0.035271745175123215, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6835, + "grad_norm": 0.08239417523145676, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.7027, + "grad_norm": 0.04198012128472328, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6959, + "grad_norm": 0.059941262006759644, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6984, + "grad_norm": 0.05842328816652298, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6917, + "grad_norm": 0.04339871555566788, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6913, + "grad_norm": 0.03161982446908951, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.7033, + "grad_norm": 0.056630611419677734, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6981, + "grad_norm": 0.035929445177316666, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6937, + "grad_norm": 0.03238064423203468, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6873, + "grad_norm": 0.1641979068517685, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6936, + "grad_norm": 0.026521405205130577, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6932, + "grad_norm": 0.047335829585790634, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6928, + "grad_norm": 0.049395907670259476, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6925, + "grad_norm": 0.05962188541889191, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6931, + "grad_norm": 0.04176504909992218, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.687, + "grad_norm": 0.07498004287481308, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6906, + "grad_norm": 0.050878819078207016, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.688, + "grad_norm": 0.04449697956442833, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6888, + "grad_norm": 0.044628970324993134, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.7021, + "grad_norm": 0.08712606877088547, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6952, + "grad_norm": 0.03987935930490494, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6762, + "grad_norm": 0.1237826943397522, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.675, + "grad_norm": 0.08815290778875351, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.7, + "grad_norm": 0.03138703852891922, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7059, + "grad_norm": 0.12030811607837677, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7034, + "grad_norm": 0.10329216718673706, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.701, + "grad_norm": 0.08663800358772278, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.7134, + "grad_norm": 0.151703879237175, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7018, + "grad_norm": 0.06121860072016716, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.7062, + "grad_norm": 0.10388082265853882, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6789, + "grad_norm": 0.1474076360464096, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7097, + "grad_norm": 0.17155906558036804, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6999, + "grad_norm": 0.06109306961297989, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6888, + "grad_norm": 0.029371464625000954, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6974, + "grad_norm": 0.13461944460868835, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6974, + "grad_norm": 0.03368905559182167, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.695, + "grad_norm": 0.04819164425134659, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6915, + "grad_norm": 0.030669376254081726, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6927, + "grad_norm": 0.09550093859434128, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6977, + "grad_norm": 0.051150668412446976, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6909, + "grad_norm": 0.11456679552793503, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6963, + "grad_norm": 0.0696570873260498, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.689, + "grad_norm": 0.030857499688863754, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.6959, + "grad_norm": 0.0487452857196331, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6933, + "grad_norm": 0.02613280713558197, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6937, + "grad_norm": 0.030959615483880043, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.692, + "grad_norm": 0.04078201204538345, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6921, + "grad_norm": 0.037513405084609985, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6927, + "grad_norm": 0.04354485869407654, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6869, + "grad_norm": 0.04515594244003296, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.6855, + "grad_norm": 0.048359259963035583, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6974, + "grad_norm": 0.08261068910360336, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7059, + "grad_norm": 0.10123255103826523, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.696, + "grad_norm": 0.03538727015256882, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6998, + "grad_norm": 0.0963803231716156, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6862, + "grad_norm": 0.0531502440571785, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.7021, + "grad_norm": 0.06597407162189484, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.7022, + "grad_norm": 0.12460726499557495, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6897, + "grad_norm": 0.09059120714664459, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6963, + "grad_norm": 0.07325052469968796, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6892, + "grad_norm": 0.07726457715034485, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.689, + "grad_norm": 0.04223347455263138, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6915, + "grad_norm": 0.11139773577451706, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6924, + "grad_norm": 0.053772035986185074, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6844, + "grad_norm": 0.130367249250412, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6948, + "grad_norm": 0.043926507234573364, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6962, + "grad_norm": 0.1054593175649643, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6951, + "grad_norm": 0.1354876309633255, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.6924, + "grad_norm": 0.03712666407227516, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6949, + "grad_norm": 0.05538730323314667, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6931, + "grad_norm": 0.024954015389084816, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6931, + "grad_norm": 0.06493628025054932, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6899, + "grad_norm": 0.08080784231424332, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6965, + "grad_norm": 0.023793745785951614, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6958, + "grad_norm": 0.10332582890987396, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6933, + "grad_norm": 0.030658062547445297, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.6962, + "grad_norm": 0.03218219429254532, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.6912, + "grad_norm": 0.08036378771066666, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6956, + "grad_norm": 0.0848485678434372, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.697, + "grad_norm": 0.05071956664323807, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6919, + "grad_norm": 0.032341137528419495, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6945, + "grad_norm": 0.024669472128152847, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6942, + "grad_norm": 0.05769066512584686, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6929, + "grad_norm": 0.082117460668087, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6954, + "grad_norm": 0.08475469052791595, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6921, + "grad_norm": 0.077925905585289, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.693, + "grad_norm": 0.1592109203338623, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6868, + "grad_norm": 0.19120724499225616, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6952, + "grad_norm": 0.023119451478123665, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6895, + "grad_norm": 0.08198559284210205, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6934, + "grad_norm": 0.032512690871953964, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6892, + "grad_norm": 0.07214111089706421, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6938, + "grad_norm": 0.025380538776516914, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6849, + "grad_norm": 0.03005836345255375, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6982, + "grad_norm": 0.11050976812839508, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6898, + "grad_norm": 0.03164194896817207, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.697, + "grad_norm": 0.04952256381511688, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6884, + "grad_norm": 0.04662276431918144, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6927, + "grad_norm": 0.10950715094804764, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6878, + "grad_norm": 0.09129885584115982, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022900736, + "loss": 0.6984, + "grad_norm": 0.12552456557750702, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6999, + "grad_norm": 0.1224503442645073, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6915, + "grad_norm": 0.05501626059412956, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6957, + "grad_norm": 0.04395895451307297, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6995, + "grad_norm": 0.04087993502616882, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6956, + "grad_norm": 0.051419373601675034, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6913, + "grad_norm": 0.053600117564201355, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6911, + "grad_norm": 0.06439915299415588, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6935, + "grad_norm": 0.03999877721071243, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6928, + "grad_norm": 0.11375035345554352, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6895, + "grad_norm": 0.027782617136836052, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6947, + "grad_norm": 0.06576699763536453, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.692, + "grad_norm": 0.060851383954286575, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6892, + "grad_norm": 0.02612634375691414, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.7006, + "grad_norm": 0.03568480163812637, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6925, + "grad_norm": 0.021742895245552063, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6934, + "grad_norm": 0.07500431686639786, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6915, + "grad_norm": 0.04113041236996651, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6958, + "grad_norm": 0.044444501399993896, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022897664, + "loss": 0.6939, + "grad_norm": 0.05527685955166817, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6949, + "grad_norm": 0.06198691949248314, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.694, + "grad_norm": 0.042763348668813705, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.693, + "grad_norm": 0.05370129644870758, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6909, + "grad_norm": 0.10656838119029999, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.6933, + "grad_norm": 0.20897749066352844, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6926, + "grad_norm": 0.03617823123931885, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6938, + "grad_norm": 0.08349359780550003, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6945, + "grad_norm": 0.06854367256164551, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6933, + "grad_norm": 0.071568064391613, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.691, + "grad_norm": 0.05263340845704079, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6908, + "grad_norm": 0.040181420743465424, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.7015, + "grad_norm": 0.12373066693544388, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6932, + "grad_norm": 0.041022367775440216, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6924, + "grad_norm": 0.050292566418647766, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6944, + "grad_norm": 0.03219418227672577, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.699, + "grad_norm": 0.14257174730300903, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6999, + "grad_norm": 0.08713658899068832, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6953, + "grad_norm": 0.06708653271198273, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6968, + "grad_norm": 0.10114135593175888, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6926, + "grad_norm": 0.037672948092222214, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6943, + "grad_norm": 0.03829833120107651, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6875, + "grad_norm": 0.10367531329393387, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6946, + "grad_norm": 0.051281239837408066, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6875, + "grad_norm": 0.07162943482398987, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.691, + "grad_norm": 0.03250992298126221, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6992, + "grad_norm": 0.07012076675891876, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6926, + "grad_norm": 0.03201422095298767, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6957, + "grad_norm": 0.04261971637606621, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.692, + "grad_norm": 0.027598712593317032, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6878, + "grad_norm": 0.1147519201040268, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6902, + "grad_norm": 0.04806976392865181, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.692, + "grad_norm": 0.02844579704105854, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6919, + "grad_norm": 0.0319044291973114, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6896, + "grad_norm": 0.027507254853844643, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6933, + "grad_norm": 0.047100476920604706, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6977, + "grad_norm": 0.12606702744960785, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022896128, + "loss": 0.6884, + "grad_norm": 0.0398981049656868, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6844, + "grad_norm": 0.10037913918495178, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6945, + "grad_norm": 0.0733317956328392, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6972, + "grad_norm": 0.11034728586673737, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6929, + "grad_norm": 0.1575852632522583, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.6973, + "grad_norm": 0.08910872042179108, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6958, + "grad_norm": 0.06674054265022278, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.7018, + "grad_norm": 0.1619773954153061, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022894592, + "loss": 0.6923, + "grad_norm": 0.025898879393935204, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6909, + "grad_norm": 0.08751345425844193, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6933, + "grad_norm": 0.0425085723400116, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.693, + "grad_norm": 0.08958403766155243, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6911, + "grad_norm": 0.05304872989654541, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6943, + "grad_norm": 0.049351274967193604, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6937, + "grad_norm": 0.035002488642930984, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6895, + "grad_norm": 0.04929906129837036, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022893056, + "loss": 0.6947, + "grad_norm": 0.08106742054224014, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6988, + "grad_norm": 0.04023308306932449, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022882304, + "loss": 0.6961, + "grad_norm": 0.13420113921165466, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6944, + "grad_norm": 0.06521180272102356, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6899, + "grad_norm": 0.049234531819820404, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6894, + "grad_norm": 0.07457038015127182, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6969, + "grad_norm": 0.11105072498321533, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6959, + "grad_norm": 0.03916989639401436, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6892, + "grad_norm": 0.0697760209441185, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.6901, + "grad_norm": 0.05426376312971115, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6967, + "grad_norm": 0.03106892667710781, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.691, + "grad_norm": 0.06403827667236328, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6915, + "grad_norm": 0.06789837032556534, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.693, + "grad_norm": 0.02781832031905651, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6896, + "grad_norm": 0.06926864385604858, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6886, + "grad_norm": 0.029269929975271225, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6914, + "grad_norm": 0.027628714218735695, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6911, + "grad_norm": 0.08255784958600998, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6957, + "grad_norm": 0.11750220507383347, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.698, + "grad_norm": 0.04874292388558388, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6895, + "grad_norm": 0.0975748747587204, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6967, + "grad_norm": 0.05236559361219406, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6922, + "grad_norm": 0.04140351340174675, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6933, + "grad_norm": 0.023921174928545952, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6896, + "grad_norm": 0.050481077283620834, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6925, + "grad_norm": 0.048850320279598236, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022885376, + "loss": 0.6919, + "grad_norm": 0.07223442941904068, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02288384, + "loss": 0.6937, + "grad_norm": 0.07931733131408691, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022897664, + "loss": 0.6929, + "grad_norm": 0.07110463827848434, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6886, + "grad_norm": 0.03750075027346611, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6921, + "grad_norm": 0.025895163416862488, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022888448, + "loss": 0.6928, + "grad_norm": 0.08014581352472305, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6947, + "grad_norm": 0.028245337307453156, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022880768, + "loss": 0.6913, + "grad_norm": 0.03812907636165619, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.866641408, + "gpu_mem": 5.02289152, + "loss": 0.6982, + "grad_norm": 0.13256292045116425, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.695, + "grad_norm": 0.0716080367565155, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022889984, + "loss": 0.6935, + "grad_norm": 0.06708957254886627, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "loss": 0.6929, + "grad_norm": 0.05825101584196091, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.866641408, + "gpu_mem": 5.022886912, + "train_runtime": 2963.5867, + "train_samples_per_second": 13.813, + "train_steps_per_second": 0.216, + "total_flos": 1.5267526629064704e+16, + "train_loss": 0.7577152683399617 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/README.md b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/adapter_config.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5723daa9f5f7b854bf548bbee9a6d37e12198a3a --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha": 16, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "inference_mode": true, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 8, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/eval_results.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..badf351d3ea98c3c8cea5fc7912e78a630fc6f98 --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.516179952644041 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/training_configuration.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..a01d642a9c9f7e1de09f70bba28d41a30549a69e --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "loha", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 12615680 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loha-winogrande-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2", + "seed": 42, + "timestamp": "2025-09-13T14:48:48.782172" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/training_logs.json b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..39bc9c52b5e75c869d35a67b42bfcd8d7f8aec6c --- /dev/null +++ b/TinyLlama_v1.1-loha/TinyLlama_v1.1-loha-winogrande-r8-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.84346624, + "gpu_mem": 4.467797504, + "loss": 3.3802, + "grad_norm": 3.4504129886627197, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.849167872, + "gpu_mem": 4.568720896, + "loss": 3.3361, + "grad_norm": 3.3937020301818848, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.849757696, + "gpu_mem": 4.568725504, + "loss": 3.2313, + "grad_norm": 3.213529348373413, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.850150912, + "gpu_mem": 4.568723968, + "loss": 3.1986, + "grad_norm": 3.3139498233795166, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.850544128, + "gpu_mem": 4.568723968, + "loss": 3.2192, + "grad_norm": 3.2745087146759033, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.850740736, + "gpu_mem": 4.568730112, + "loss": 3.2166, + "grad_norm": 3.363171339035034, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.850937344, + "gpu_mem": 4.568736256, + "loss": 3.1223, + "grad_norm": 3.3687081336975098, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.851133952, + "gpu_mem": 4.56871936, + "loss": 3.0669, + "grad_norm": 3.2087154388427734, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.85133056, + "gpu_mem": 4.568725504, + "loss": 3.2425, + "grad_norm": 3.442781925201416, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.851723776, + "gpu_mem": 4.568728576, + "loss": 3.0373, + "grad_norm": 3.4485840797424316, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.851723776, + "gpu_mem": 4.568717824, + "loss": 2.9519, + "grad_norm": 3.3718109130859375, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.851920384, + "gpu_mem": 4.568722432, + "loss": 3.035, + "grad_norm": 3.4148144721984863, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.852116992, + "gpu_mem": 4.568730112, + "loss": 2.778, + "grad_norm": 3.2427854537963867, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.8523136, + "gpu_mem": 4.568725504, + "loss": 2.6907, + "grad_norm": 3.355508327484131, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.8523136, + "gpu_mem": 4.568725504, + "loss": 2.4991, + "grad_norm": 3.401430606842041, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.852510208, + "gpu_mem": 4.568722432, + "loss": 2.4509, + "grad_norm": 3.359720230102539, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.852706816, + "gpu_mem": 4.568722432, + "loss": 2.2392, + "grad_norm": 3.236818313598633, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.852706816, + "gpu_mem": 4.568725504, + "loss": 2.1985, + "grad_norm": 3.152644634246826, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.852903424, + "gpu_mem": 4.568722432, + "loss": 2.0676, + "grad_norm": 3.071321725845337, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.852903424, + "gpu_mem": 4.568730112, + "loss": 2.0779, + "grad_norm": 2.9761135578155518, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.853100032, + "gpu_mem": 4.568722432, + "loss": 1.8957, + "grad_norm": 2.783867359161377, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.853100032, + "gpu_mem": 4.568722432, + "loss": 1.7282, + "grad_norm": 2.66548752784729, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.853100032, + "gpu_mem": 4.568717824, + "loss": 1.7321, + "grad_norm": 2.64902400970459, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.853100032, + "gpu_mem": 4.568720896, + "loss": 1.5621, + "grad_norm": 2.1523537635803223, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.853100032, + "gpu_mem": 4.568723968, + "loss": 1.4766, + "grad_norm": 2.01345157623291, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.853100032, + "gpu_mem": 4.56871936, + "loss": 1.2088, + "grad_norm": 1.6337004899978638, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.853100032, + "gpu_mem": 4.568717824, + "loss": 1.305, + "grad_norm": 1.4882169961929321, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.853100032, + "gpu_mem": 4.568723968, + "loss": 1.0731, + "grad_norm": 1.0536600351333618, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.85329664, + "gpu_mem": 4.568722432, + "loss": 1.0621, + "grad_norm": 0.9423829317092896, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.85329664, + "gpu_mem": 4.568722432, + "loss": 1.0595, + "grad_norm": 0.9189621210098267, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.85329664, + "gpu_mem": 4.568722432, + "loss": 0.9274, + "grad_norm": 0.6903952360153198, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.85329664, + "gpu_mem": 4.56871936, + "loss": 0.9299, + "grad_norm": 0.7699345350265503, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.85329664, + "gpu_mem": 4.56871936, + "loss": 0.8292, + "grad_norm": 0.3511117696762085, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.85329664, + "gpu_mem": 4.56871936, + "loss": 0.7818, + "grad_norm": 0.3477895259857178, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.85329664, + "gpu_mem": 4.568725504, + "loss": 0.7306, + "grad_norm": 0.28685328364372253, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.85329664, + "gpu_mem": 4.568720896, + "loss": 0.7663, + "grad_norm": 0.32825714349746704, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.853493248, + "gpu_mem": 4.56871936, + "loss": 0.7879, + "grad_norm": 0.8208101987838745, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.853493248, + "gpu_mem": 4.568723968, + "loss": 0.7683, + "grad_norm": 0.4717964231967926, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.853493248, + "gpu_mem": 4.568730112, + "loss": 0.7178, + "grad_norm": 0.29046425223350525, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.853493248, + "gpu_mem": 4.56872704, + "loss": 0.7502, + "grad_norm": 0.4415625035762787, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.853689856, + "gpu_mem": 4.56872704, + "loss": 0.7399, + "grad_norm": 0.4340401887893677, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568723968, + "loss": 0.7143, + "grad_norm": 0.4169151484966278, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568723968, + "loss": 0.7451, + "grad_norm": 0.6522882580757141, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568723968, + "loss": 0.7422, + "grad_norm": 0.2987101674079895, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568731648, + "loss": 0.7086, + "grad_norm": 0.2406589686870575, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568723968, + "loss": 0.7484, + "grad_norm": 0.29658231139183044, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568725504, + "loss": 0.7532, + "grad_norm": 0.3975699245929718, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568725504, + "loss": 0.7589, + "grad_norm": 0.5798319578170776, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568716288, + "loss": 0.7081, + "grad_norm": 0.8273880481719971, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.853689856, + "gpu_mem": 4.56871936, + "loss": 0.7265, + "grad_norm": 0.19095920026302338, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.853689856, + "gpu_mem": 4.56872704, + "loss": 0.7418, + "grad_norm": 0.5553826093673706, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568720896, + "loss": 0.6872, + "grad_norm": 0.24470654129981995, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568725504, + "loss": 0.7267, + "grad_norm": 0.47674521803855896, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568713216, + "loss": 0.7021, + "grad_norm": 0.26884526014328003, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568717824, + "loss": 0.6785, + "grad_norm": 0.15531903505325317, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568725504, + "loss": 0.7206, + "grad_norm": 0.15900364518165588, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568716288, + "loss": 0.69, + "grad_norm": 0.22393159568309784, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.853689856, + "gpu_mem": 4.56871936, + "loss": 0.6959, + "grad_norm": 0.1432359218597412, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568731648, + "loss": 0.7082, + "grad_norm": 0.16359610855579376, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568722432, + "loss": 0.6958, + "grad_norm": 0.20816905796527863, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568722432, + "loss": 0.7031, + "grad_norm": 0.12582896649837494, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568722432, + "loss": 0.7093, + "grad_norm": 0.195413738489151, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568720896, + "loss": 0.7165, + "grad_norm": 0.4113353490829468, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.853689856, + "gpu_mem": 4.568730112, + "loss": 0.7057, + "grad_norm": 0.17279121279716492, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.853689856, + "gpu_mem": 4.56871936, + "loss": 0.7013, + "grad_norm": 0.22968897223472595, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568725504, + "loss": 0.717, + "grad_norm": 0.32136449217796326, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568723968, + "loss": 0.6874, + "grad_norm": 0.20254182815551758, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568722432, + "loss": 0.7118, + "grad_norm": 0.19545334577560425, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568723968, + "loss": 0.7065, + "grad_norm": 0.15388335287570953, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568730112, + "loss": 0.7139, + "grad_norm": 0.25726062059402466, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568728576, + "loss": 0.6924, + "grad_norm": 0.11241628974676132, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56872704, + "loss": 0.6898, + "grad_norm": 0.09832803905010223, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568723968, + "loss": 0.7172, + "grad_norm": 0.30987173318862915, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568717824, + "loss": 0.7038, + "grad_norm": 0.1952323019504547, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568723968, + "loss": 0.718, + "grad_norm": 0.3868877589702606, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56872704, + "loss": 0.7001, + "grad_norm": 0.2339070439338684, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568730112, + "loss": 0.6964, + "grad_norm": 0.09525403380393982, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56871936, + "loss": 0.6878, + "grad_norm": 0.107442706823349, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568722432, + "loss": 0.6994, + "grad_norm": 0.12416708469390869, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56871936, + "loss": 0.7138, + "grad_norm": 0.12264889478683472, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568716288, + "loss": 0.7047, + "grad_norm": 0.14545975625514984, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568717824, + "loss": 0.6924, + "grad_norm": 0.16536127030849457, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568720896, + "loss": 0.692, + "grad_norm": 0.20507080852985382, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568717824, + "loss": 0.7009, + "grad_norm": 0.16796040534973145, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568722432, + "loss": 0.7058, + "grad_norm": 0.1183355301618576, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56871936, + "loss": 0.7098, + "grad_norm": 0.1230674460530281, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568725504, + "loss": 0.6978, + "grad_norm": 0.13773928582668304, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568725504, + "loss": 0.6907, + "grad_norm": 0.07152018696069717, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56872704, + "loss": 0.6998, + "grad_norm": 0.17459940910339355, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568725504, + "loss": 0.6906, + "grad_norm": 0.1460246741771698, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568717824, + "loss": 0.6893, + "grad_norm": 0.3170492649078369, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568725504, + "loss": 0.7146, + "grad_norm": 0.2245081067085266, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56871936, + "loss": 0.6992, + "grad_norm": 0.12044331431388855, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568720896, + "loss": 0.6967, + "grad_norm": 0.10374974459409714, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568725504, + "loss": 0.6892, + "grad_norm": 0.14845247566699982, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.853886464, + "gpu_mem": 4.56871936, + "loss": 0.7008, + "grad_norm": 0.2431282103061676, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.853886464, + "gpu_mem": 4.568722432, + "loss": 0.6955, + "grad_norm": 0.07878247648477554, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6984, + "grad_norm": 0.06996665149927139, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.7073, + "grad_norm": 0.319320946931839, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6999, + "grad_norm": 0.1338057518005371, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.69, + "grad_norm": 0.12037083506584167, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.7066, + "grad_norm": 0.20889697968959808, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6902, + "grad_norm": 0.09260699152946472, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7215, + "grad_norm": 0.2360643595457077, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7005, + "grad_norm": 0.15778642892837524, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6999, + "grad_norm": 0.07770750671625137, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7004, + "grad_norm": 0.0836663544178009, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6699, + "grad_norm": 0.09540589898824692, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7117, + "grad_norm": 0.12564902007579803, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.7116, + "grad_norm": 0.23568475246429443, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6979, + "grad_norm": 0.14684464037418365, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.704, + "grad_norm": 0.10948102176189423, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568714752, + "loss": 0.7075, + "grad_norm": 0.2302074134349823, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6953, + "grad_norm": 0.12810751795768738, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.692, + "grad_norm": 0.1111094132065773, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7328, + "grad_norm": 0.45820122957229614, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.7045, + "grad_norm": 0.08192773908376694, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7029, + "grad_norm": 0.12758895754814148, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6946, + "grad_norm": 0.22305811941623688, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6882, + "grad_norm": 0.0826788991689682, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.696, + "grad_norm": 0.1128137931227684, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6929, + "grad_norm": 0.25376710295677185, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.7034, + "grad_norm": 0.06663811951875687, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7062, + "grad_norm": 0.12995542585849762, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568714752, + "loss": 0.6986, + "grad_norm": 0.11046002805233002, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6999, + "grad_norm": 0.17273232340812683, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7067, + "grad_norm": 0.154164656996727, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.6936, + "grad_norm": 0.07747945189476013, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7109, + "grad_norm": 0.1001417264342308, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.7083, + "grad_norm": 0.08080457895994186, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7006, + "grad_norm": 0.13520367443561554, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6996, + "grad_norm": 0.1593540459871292, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7033, + "grad_norm": 0.3171948492527008, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7002, + "grad_norm": 0.05765560641884804, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6996, + "grad_norm": 0.21977953612804413, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6928, + "grad_norm": 0.1360054463148117, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6896, + "grad_norm": 0.06808435171842575, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7027, + "grad_norm": 0.11082667857408524, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.719, + "grad_norm": 0.1999606341123581, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6924, + "grad_norm": 0.07000905275344849, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6846, + "grad_norm": 0.09739074856042862, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6921, + "grad_norm": 0.08326389640569687, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7022, + "grad_norm": 0.13074083626270294, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7065, + "grad_norm": 0.13657601177692413, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.6918, + "grad_norm": 0.07735764980316162, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.68, + "grad_norm": 0.07243376970291138, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7035, + "grad_norm": 0.15880069136619568, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6931, + "grad_norm": 0.061384767293930054, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6856, + "grad_norm": 0.13749147951602936, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6881, + "grad_norm": 0.1323305368423462, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6949, + "grad_norm": 0.09503695368766785, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6958, + "grad_norm": 0.04638821259140968, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6916, + "grad_norm": 0.09366676211357117, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6978, + "grad_norm": 0.06327635049819946, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6972, + "grad_norm": 0.16531430184841156, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.6975, + "grad_norm": 0.09249331057071686, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6832, + "grad_norm": 0.445426881313324, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568733184, + "loss": 0.6915, + "grad_norm": 0.08213992416858673, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6924, + "grad_norm": 0.07736635208129883, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.702, + "grad_norm": 0.13544806838035583, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7089, + "grad_norm": 0.1819426715373993, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.7172, + "grad_norm": 0.19652830064296722, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7338, + "grad_norm": 0.3434111773967743, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7192, + "grad_norm": 0.2874121367931366, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.7167, + "grad_norm": 0.23032519221305847, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6951, + "grad_norm": 0.052257560193538666, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.703, + "grad_norm": 0.10879190266132355, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.7078, + "grad_norm": 0.12197710573673248, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7118, + "grad_norm": 0.18378819525241852, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.7266, + "grad_norm": 0.2912314534187317, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6815, + "grad_norm": 0.06145884841680527, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7255, + "grad_norm": 0.1996554434299469, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568714752, + "loss": 0.7215, + "grad_norm": 0.1858702450990677, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7022, + "grad_norm": 0.09776411950588226, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6911, + "grad_norm": 0.06149415671825409, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6995, + "grad_norm": 0.07900120317935944, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7034, + "grad_norm": 0.0779632106423378, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6891, + "grad_norm": 0.06844642758369446, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6964, + "grad_norm": 0.09328778833150864, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6965, + "grad_norm": 0.07550402730703354, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568737792, + "loss": 0.7062, + "grad_norm": 0.2618708610534668, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.689, + "grad_norm": 0.13620974123477936, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6812, + "grad_norm": 0.10667377710342407, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6928, + "grad_norm": 0.10220897197723389, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6951, + "grad_norm": 0.06887640058994293, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6954, + "grad_norm": 0.12092932313680649, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.686, + "grad_norm": 0.13079635798931122, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6849, + "grad_norm": 0.052671920508146286, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7001, + "grad_norm": 0.14996251463890076, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6849, + "grad_norm": 0.06776002794504166, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.6801, + "grad_norm": 0.058321885764598846, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7221, + "grad_norm": 0.30239298939704895, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7015, + "grad_norm": 0.1540907919406891, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.7018, + "grad_norm": 0.14612354338169098, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6924, + "grad_norm": 0.20554287731647491, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6918, + "grad_norm": 0.06863006949424744, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.6973, + "grad_norm": 0.07825738936662674, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6925, + "grad_norm": 0.07648196816444397, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6939, + "grad_norm": 0.1698685735464096, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6937, + "grad_norm": 0.06397654861211777, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.7019, + "grad_norm": 0.053321342915296555, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7015, + "grad_norm": 0.19043336808681488, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6991, + "grad_norm": 0.288440465927124, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6988, + "grad_norm": 0.07199437916278839, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.6928, + "grad_norm": 0.06742195039987564, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.691, + "grad_norm": 0.06388840079307556, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6981, + "grad_norm": 0.07368846237659454, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6965, + "grad_norm": 0.14568206667900085, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7042, + "grad_norm": 0.13774316012859344, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7084, + "grad_norm": 0.1262449324131012, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6928, + "grad_norm": 0.14259923994541168, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6896, + "grad_norm": 0.056164465844631195, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7013, + "grad_norm": 0.1321733295917511, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568731648, + "loss": 0.6865, + "grad_norm": 0.06659462302923203, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7025, + "grad_norm": 0.18153777718544006, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7046, + "grad_norm": 0.22185759246349335, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6979, + "grad_norm": 0.059270620346069336, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.687, + "grad_norm": 0.160055011510849, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.695, + "grad_norm": 0.05660351738333702, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6935, + "grad_norm": 0.04720282182097435, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.685, + "grad_norm": 0.16811363399028778, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6986, + "grad_norm": 0.08217649161815643, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.6889, + "grad_norm": 0.07627619057893753, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6956, + "grad_norm": 0.08060864359140396, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7074, + "grad_norm": 0.18192525207996368, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7083, + "grad_norm": 0.14217273890972137, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7006, + "grad_norm": 0.0780787467956543, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6952, + "grad_norm": 0.07562148571014404, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6908, + "grad_norm": 0.0654444769024849, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6919, + "grad_norm": 0.07079274207353592, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6917, + "grad_norm": 0.1457984894514084, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6963, + "grad_norm": 0.0761900395154953, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.7008, + "grad_norm": 0.10101860761642456, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568733184, + "loss": 0.6869, + "grad_norm": 0.13449053466320038, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7038, + "grad_norm": 0.16189780831336975, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6958, + "grad_norm": 0.12727639079093933, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568731648, + "loss": 0.7149, + "grad_norm": 0.24275602400302887, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.7194, + "grad_norm": 0.3477574586868286, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.6952, + "grad_norm": 0.10241160541772842, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6954, + "grad_norm": 0.05166786164045334, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6771, + "grad_norm": 0.15929239988327026, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6786, + "grad_norm": 0.07853837311267853, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568731648, + "loss": 0.7218, + "grad_norm": 0.25261589884757996, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7154, + "grad_norm": 0.1677459478378296, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7122, + "grad_norm": 0.19025690853595734, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6923, + "grad_norm": 0.04499945044517517, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7043, + "grad_norm": 0.12304318696260452, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6867, + "grad_norm": 0.0621953010559082, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6994, + "grad_norm": 0.14125511050224304, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6973, + "grad_norm": 0.11613954603672028, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7003, + "grad_norm": 0.08803705871105194, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6942, + "grad_norm": 0.11844367533922195, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7013, + "grad_norm": 0.051238421350717545, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6993, + "grad_norm": 0.06984098255634308, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.6868, + "grad_norm": 0.054323676973581314, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7019, + "grad_norm": 0.13219225406646729, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.7003, + "grad_norm": 0.09455225616693497, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7074, + "grad_norm": 0.1854027509689331, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6876, + "grad_norm": 0.113352470099926, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6877, + "grad_norm": 0.04072001203894615, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.7135, + "grad_norm": 0.2658751308917999, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6936, + "grad_norm": 0.058157872408628464, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568733184, + "loss": 0.6949, + "grad_norm": 0.04390765354037285, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6924, + "grad_norm": 0.03930649161338806, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7017, + "grad_norm": 0.07820788025856018, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6851, + "grad_norm": 0.0698002353310585, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6906, + "grad_norm": 0.06187206879258156, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7028, + "grad_norm": 0.18601058423519135, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6914, + "grad_norm": 0.06263062357902527, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7013, + "grad_norm": 0.16186247766017914, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.694, + "grad_norm": 0.07229592651128769, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6827, + "grad_norm": 0.11698590219020844, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7034, + "grad_norm": 0.13175538182258606, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6802, + "grad_norm": 0.10112902522087097, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7048, + "grad_norm": 0.13724392652511597, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7007, + "grad_norm": 0.10959451645612717, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.6906, + "grad_norm": 0.12101038545370102, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6836, + "grad_norm": 0.05412405729293823, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6999, + "grad_norm": 0.14444603025913239, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568730112, + "loss": 0.6978, + "grad_norm": 0.0637621060013771, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.6937, + "grad_norm": 0.10877161473035812, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.699, + "grad_norm": 0.07844477146863937, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7118, + "grad_norm": 0.25183504819869995, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7104, + "grad_norm": 0.12056131660938263, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6963, + "grad_norm": 0.07834596931934357, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6926, + "grad_norm": 0.20563526451587677, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.696, + "grad_norm": 0.04166165739297867, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6997, + "grad_norm": 0.0381128154695034, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.7003, + "grad_norm": 0.07162873446941376, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6884, + "grad_norm": 0.054096564650535583, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6912, + "grad_norm": 0.04966523125767708, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568713216, + "loss": 0.6949, + "grad_norm": 0.05009598657488823, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.7062, + "grad_norm": 0.17606361210346222, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568714752, + "loss": 0.7114, + "grad_norm": 0.24837324023246765, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6922, + "grad_norm": 0.05836619436740875, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6945, + "grad_norm": 0.13102282583713531, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6967, + "grad_norm": 0.04907192289829254, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.696, + "grad_norm": 0.05123186483979225, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.698, + "grad_norm": 0.07110757380723953, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7009, + "grad_norm": 0.19952966272830963, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6905, + "grad_norm": 0.050469059497117996, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6925, + "grad_norm": 0.0514473058283329, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568733184, + "loss": 0.6984, + "grad_norm": 0.08106235414743423, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6962, + "grad_norm": 0.1008216068148613, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6964, + "grad_norm": 0.1103982925415039, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6904, + "grad_norm": 0.05638118460774422, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6962, + "grad_norm": 0.12480581551790237, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6944, + "grad_norm": 0.07480910420417786, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7075, + "grad_norm": 0.1889805793762207, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6909, + "grad_norm": 0.11781422048807144, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.6937, + "grad_norm": 0.07443884015083313, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568716288, + "loss": 0.6995, + "grad_norm": 0.09390020370483398, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6871, + "grad_norm": 0.05187615007162094, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.689, + "grad_norm": 0.04864860698580742, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6895, + "grad_norm": 0.05322040244936943, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.7021, + "grad_norm": 0.17000792920589447, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7018, + "grad_norm": 0.14305894076824188, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6969, + "grad_norm": 0.11205185949802399, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.6941, + "grad_norm": 0.13587351143360138, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56871936, + "loss": 0.6999, + "grad_norm": 0.16821838915348053, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6833, + "grad_norm": 0.25253528356552124, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6926, + "grad_norm": 0.04928479343652725, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6972, + "grad_norm": 0.072969950735569, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568733184, + "loss": 0.7143, + "grad_norm": 0.2066340148448944, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.7039, + "grad_norm": 0.12595170736312866, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568731648, + "loss": 0.7089, + "grad_norm": 0.13437609374523163, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.6821, + "grad_norm": 0.03884238749742508, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.7091, + "grad_norm": 0.14203065633773804, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568717824, + "loss": 0.714, + "grad_norm": 0.2226100116968155, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.6948, + "grad_norm": 0.10160825401544571, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568723968, + "loss": 0.6967, + "grad_norm": 0.07650664448738098, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.7015, + "grad_norm": 0.21604572236537933, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.702, + "grad_norm": 0.15451766550540924, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568720896, + "loss": 0.6918, + "grad_norm": 0.04759039357304573, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.85526272, + "gpu_mem": 4.56872704, + "loss": 0.69, + "grad_norm": 0.040966518223285675, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568722432, + "loss": 0.6825, + "grad_norm": 0.110944963991642, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568728576, + "loss": 0.7069, + "grad_norm": 0.18592797219753265, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.85526272, + "gpu_mem": 4.568725504, + "loss": 0.7166, + "grad_norm": 0.2256513237953186, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.7061, + "grad_norm": 0.16412708163261414, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.7078, + "grad_norm": 0.13440434634685516, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6843, + "grad_norm": 0.0481422059237957, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6813, + "grad_norm": 0.04724964126944542, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6902, + "grad_norm": 0.08720191568136215, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6913, + "grad_norm": 0.06165364757180214, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6925, + "grad_norm": 0.06138116121292114, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6945, + "grad_norm": 0.06758056581020355, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6988, + "grad_norm": 0.13849711418151855, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6993, + "grad_norm": 0.12426025420427322, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871168, + "loss": 0.6923, + "grad_norm": 0.05094262585043907, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6874, + "grad_norm": 0.041270505636930466, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6918, + "grad_norm": 0.08740103244781494, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6988, + "grad_norm": 0.05433983355760574, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6972, + "grad_norm": 0.052345160394907, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6807, + "grad_norm": 0.3569212853908539, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6946, + "grad_norm": 0.10960318893194199, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6999, + "grad_norm": 0.11043385416269302, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.7041, + "grad_norm": 0.1482958048582077, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6925, + "grad_norm": 0.113813117146492, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.7001, + "grad_norm": 0.0789138525724411, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.703, + "grad_norm": 0.1365068256855011, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6946, + "grad_norm": 0.051438212394714355, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.7107, + "grad_norm": 0.21676485240459442, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6943, + "grad_norm": 0.0931646004319191, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6938, + "grad_norm": 0.08557265996932983, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6861, + "grad_norm": 0.10873681306838989, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6946, + "grad_norm": 0.03973779082298279, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6891, + "grad_norm": 0.15243104100227356, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6918, + "grad_norm": 0.11295383423566818, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6887, + "grad_norm": 0.10425041615962982, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6952, + "grad_norm": 0.05153089389204979, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6972, + "grad_norm": 0.07740368694067001, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.7031, + "grad_norm": 0.096824511885643, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6947, + "grad_norm": 0.07162784785032272, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56873472, + "loss": 0.6807, + "grad_norm": 0.10157425701618195, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.705, + "grad_norm": 0.13477148115634918, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7063, + "grad_norm": 0.1613539308309555, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56873472, + "loss": 0.7096, + "grad_norm": 0.1276397407054901, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6978, + "grad_norm": 0.14898060262203217, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6899, + "grad_norm": 0.050504013895988464, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7026, + "grad_norm": 0.1839565485715866, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6884, + "grad_norm": 0.04726330190896988, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6977, + "grad_norm": 0.12925122678279877, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6966, + "grad_norm": 0.11150603741407394, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6999, + "grad_norm": 0.07461898028850555, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6988, + "grad_norm": 0.1526094526052475, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.6974, + "grad_norm": 0.06657658517360687, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6859, + "grad_norm": 0.06916019320487976, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6984, + "grad_norm": 0.09099823236465454, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6916, + "grad_norm": 0.04499451443552971, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7035, + "grad_norm": 0.14227603375911713, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6979, + "grad_norm": 0.07381153851747513, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6954, + "grad_norm": 0.05968286469578743, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7062, + "grad_norm": 0.1184651181101799, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6905, + "grad_norm": 0.03869135305285454, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.7043, + "grad_norm": 0.26188573241233826, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.7, + "grad_norm": 0.0695328488945961, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.692, + "grad_norm": 0.052054740488529205, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6987, + "grad_norm": 0.04661169648170471, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6969, + "grad_norm": 0.10610973089933395, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7022, + "grad_norm": 0.10162895917892456, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6885, + "grad_norm": 0.13090471923351288, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6965, + "grad_norm": 0.0425846241414547, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6911, + "grad_norm": 0.0407833606004715, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7057, + "grad_norm": 0.13415461778640747, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6892, + "grad_norm": 0.04620187357068062, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6791, + "grad_norm": 0.05101415142416954, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6883, + "grad_norm": 0.03434237465262413, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.6833, + "grad_norm": 0.0376986563205719, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6865, + "grad_norm": 0.03673483803868294, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6883, + "grad_norm": 0.05888773128390312, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.7054, + "grad_norm": 0.15462328493595123, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.69, + "grad_norm": 0.036005232483148575, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.689, + "grad_norm": 0.048952847719192505, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6982, + "grad_norm": 0.07073793560266495, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6906, + "grad_norm": 0.10087292641401291, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.7053, + "grad_norm": 0.14664332568645477, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7097, + "grad_norm": 0.14997157454490662, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6881, + "grad_norm": 0.051492393016815186, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7057, + "grad_norm": 0.17492865025997162, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.689, + "grad_norm": 0.04631288722157478, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6981, + "grad_norm": 0.07247461378574371, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6919, + "grad_norm": 0.11709819734096527, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6949, + "grad_norm": 0.1885722279548645, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.702, + "grad_norm": 0.05048968270421028, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6927, + "grad_norm": 0.056769296526908875, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6943, + "grad_norm": 0.10498225688934326, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6969, + "grad_norm": 0.044346872717142105, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.706, + "grad_norm": 0.17917107045650482, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.693, + "grad_norm": 0.049765463918447495, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6928, + "grad_norm": 0.042269930243492126, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6835, + "grad_norm": 0.10812237858772278, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.7041, + "grad_norm": 0.05831798538565636, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6951, + "grad_norm": 0.07923564314842224, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6974, + "grad_norm": 0.07728171348571777, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6932, + "grad_norm": 0.047816164791584015, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.691, + "grad_norm": 0.04560060054063797, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.705, + "grad_norm": 0.07674155384302139, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.7015, + "grad_norm": 0.05421040952205658, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6935, + "grad_norm": 0.04990521818399429, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.686, + "grad_norm": 0.20847037434577942, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6966, + "grad_norm": 0.04778938367962837, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6943, + "grad_norm": 0.0582558773458004, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6923, + "grad_norm": 0.06871891021728516, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6917, + "grad_norm": 0.06936467438936234, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6929, + "grad_norm": 0.05200151354074478, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.687, + "grad_norm": 0.09301310032606125, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6917, + "grad_norm": 0.06589304655790329, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6878, + "grad_norm": 0.056901391595602036, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.687, + "grad_norm": 0.05450790748000145, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.7018, + "grad_norm": 0.11489569395780563, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6954, + "grad_norm": 0.051400862634181976, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6772, + "grad_norm": 0.16284003853797913, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6758, + "grad_norm": 0.11490625143051147, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6995, + "grad_norm": 0.03971033915877342, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7049, + "grad_norm": 0.15054874122142792, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7028, + "grad_norm": 0.1298811286687851, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.7, + "grad_norm": 0.10595667362213135, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.7118, + "grad_norm": 0.19149240851402283, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7037, + "grad_norm": 0.08489054441452026, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.7036, + "grad_norm": 0.1307344287633896, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6803, + "grad_norm": 0.19161999225616455, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7099, + "grad_norm": 0.22361429035663605, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7, + "grad_norm": 0.08471038192510605, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6886, + "grad_norm": 0.04230336844921112, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6976, + "grad_norm": 0.18010984361171722, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6984, + "grad_norm": 0.04509400576353073, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6958, + "grad_norm": 0.06472751498222351, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6913, + "grad_norm": 0.05414916202425957, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6915, + "grad_norm": 0.1201988086104393, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6956, + "grad_norm": 0.06418033689260483, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6926, + "grad_norm": 0.15704886615276337, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6955, + "grad_norm": 0.08756083995103836, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6884, + "grad_norm": 0.04312921315431595, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.6945, + "grad_norm": 0.06447487324476242, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6929, + "grad_norm": 0.03452318534255028, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.694, + "grad_norm": 0.04169880598783493, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.694, + "grad_norm": 0.061373088508844376, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6924, + "grad_norm": 0.05678066238760948, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6928, + "grad_norm": 0.07065337151288986, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6881, + "grad_norm": 0.06494921445846558, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.6842, + "grad_norm": 0.07169365882873535, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6937, + "grad_norm": 0.10525424033403397, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7056, + "grad_norm": 0.13718923926353455, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6982, + "grad_norm": 0.054841190576553345, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.7002, + "grad_norm": 0.13110651075839996, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6869, + "grad_norm": 0.0696168765425682, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.7015, + "grad_norm": 0.08911189436912537, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7003, + "grad_norm": 0.1656215488910675, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6881, + "grad_norm": 0.12205776572227478, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6987, + "grad_norm": 0.10236950218677521, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.689, + "grad_norm": 0.10472013801336288, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6885, + "grad_norm": 0.06311122328042984, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6921, + "grad_norm": 0.14902713894844055, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6927, + "grad_norm": 0.07632028311491013, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6829, + "grad_norm": 0.17148610949516296, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6946, + "grad_norm": 0.05878931283950806, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.697, + "grad_norm": 0.14408718049526215, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6954, + "grad_norm": 0.18540993332862854, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.6919, + "grad_norm": 0.05077129602432251, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6947, + "grad_norm": 0.07814621925354004, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6931, + "grad_norm": 0.03943296894431114, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6931, + "grad_norm": 0.08433721214532852, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6908, + "grad_norm": 0.10744985938072205, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6997, + "grad_norm": 0.037091366946697235, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6953, + "grad_norm": 0.14634302258491516, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6929, + "grad_norm": 0.050474293529987335, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.6962, + "grad_norm": 0.04855343699455261, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.6894, + "grad_norm": 0.1029915064573288, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6958, + "grad_norm": 0.11748373508453369, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6958, + "grad_norm": 0.07771521061658859, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6935, + "grad_norm": 0.04995797947049141, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6964, + "grad_norm": 0.03591790795326233, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6968, + "grad_norm": 0.08589250594377518, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6933, + "grad_norm": 0.10803424566984177, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6983, + "grad_norm": 0.12387946993112564, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6936, + "grad_norm": 0.10464803129434586, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6924, + "grad_norm": 0.215131938457489, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6854, + "grad_norm": 0.25530877709388733, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6957, + "grad_norm": 0.034736596047878265, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6892, + "grad_norm": 0.11319643259048462, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6946, + "grad_norm": 0.04104110971093178, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6885, + "grad_norm": 0.10457754880189896, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6942, + "grad_norm": 0.036570653319358826, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6838, + "grad_norm": 0.04337208718061447, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6996, + "grad_norm": 0.1480090469121933, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6886, + "grad_norm": 0.05210980772972107, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6988, + "grad_norm": 0.06615308672189713, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6893, + "grad_norm": 0.06612817198038101, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6906, + "grad_norm": 0.14526495337486267, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6878, + "grad_norm": 0.12375463545322418, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568736256, + "loss": 0.6973, + "grad_norm": 0.1709333062171936, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.7008, + "grad_norm": 0.166754812002182, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6911, + "grad_norm": 0.07224541902542114, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6963, + "grad_norm": 0.058658402413129807, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6981, + "grad_norm": 0.05131181702017784, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6962, + "grad_norm": 0.07549209892749786, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6901, + "grad_norm": 0.07970201969146729, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6901, + "grad_norm": 0.08799194544553757, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6952, + "grad_norm": 0.05751499533653259, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6943, + "grad_norm": 0.15134644508361816, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6882, + "grad_norm": 0.03918123245239258, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6941, + "grad_norm": 0.09022883325815201, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6919, + "grad_norm": 0.08075565099716187, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6888, + "grad_norm": 0.03629725053906441, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.7006, + "grad_norm": 0.04961460828781128, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6925, + "grad_norm": 0.03605938330292702, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.693, + "grad_norm": 0.10066578537225723, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6908, + "grad_norm": 0.058576423674821854, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6969, + "grad_norm": 0.06309940665960312, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568733184, + "loss": 0.694, + "grad_norm": 0.07262679189443588, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.694, + "grad_norm": 0.07931637018918991, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6934, + "grad_norm": 0.06133287027478218, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.693, + "grad_norm": 0.070188969373703, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6918, + "grad_norm": 0.14381824433803558, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.6948, + "grad_norm": 0.27887627482414246, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6917, + "grad_norm": 0.04885395988821983, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6947, + "grad_norm": 0.10982704162597656, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6952, + "grad_norm": 0.0856032595038414, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.696, + "grad_norm": 0.09806036204099655, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6922, + "grad_norm": 0.07029327005147934, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6906, + "grad_norm": 0.05572587996721268, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6996, + "grad_norm": 0.16475233435630798, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6918, + "grad_norm": 0.052723683416843414, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6922, + "grad_norm": 0.06842104345560074, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6961, + "grad_norm": 0.040101371705532074, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6974, + "grad_norm": 0.1878637969493866, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.7008, + "grad_norm": 0.11525216698646545, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6967, + "grad_norm": 0.09618380665779114, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6988, + "grad_norm": 0.13766540586948395, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6946, + "grad_norm": 0.05730373039841652, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6942, + "grad_norm": 0.054478440433740616, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6873, + "grad_norm": 0.1401209980249405, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6974, + "grad_norm": 0.07234185934066772, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6892, + "grad_norm": 0.09473156929016113, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6915, + "grad_norm": 0.04797021672129631, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6992, + "grad_norm": 0.09570544958114624, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6932, + "grad_norm": 0.04867026209831238, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6956, + "grad_norm": 0.05665061995387077, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6915, + "grad_norm": 0.038303617388010025, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6883, + "grad_norm": 0.15117743611335754, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6897, + "grad_norm": 0.06413128226995468, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.6917, + "grad_norm": 0.04234510660171509, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6918, + "grad_norm": 0.04766499251127243, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6897, + "grad_norm": 0.03783171996474266, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6939, + "grad_norm": 0.06593240797519684, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6991, + "grad_norm": 0.16689562797546387, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568731648, + "loss": 0.6886, + "grad_norm": 0.060014791786670685, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6852, + "grad_norm": 0.13220937550067902, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6955, + "grad_norm": 0.09731923788785934, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.697, + "grad_norm": 0.14534607529640198, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6902, + "grad_norm": 0.20513303577899933, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.6982, + "grad_norm": 0.11945518106222153, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.698, + "grad_norm": 0.0876612439751625, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.7008, + "grad_norm": 0.21419629454612732, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568730112, + "loss": 0.6932, + "grad_norm": 0.04071776941418648, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6906, + "grad_norm": 0.11549919098615646, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6932, + "grad_norm": 0.06317292898893356, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6923, + "grad_norm": 0.11856354773044586, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.691, + "grad_norm": 0.07129275053739548, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6933, + "grad_norm": 0.06670328229665756, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.6945, + "grad_norm": 0.04833657667040825, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6881, + "grad_norm": 0.0667136162519455, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568728576, + "loss": 0.694, + "grad_norm": 0.1077226921916008, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.7015, + "grad_norm": 0.05046232044696808, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568717824, + "loss": 0.6964, + "grad_norm": 0.17576420307159424, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6939, + "grad_norm": 0.08719099313020706, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6906, + "grad_norm": 0.0637313574552536, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6895, + "grad_norm": 0.09915664792060852, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6954, + "grad_norm": 0.14777147769927979, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6948, + "grad_norm": 0.05205446854233742, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6881, + "grad_norm": 0.09432388842105865, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.6905, + "grad_norm": 0.07120528072118759, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6985, + "grad_norm": 0.03972097113728523, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6915, + "grad_norm": 0.0842246487736702, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6917, + "grad_norm": 0.09255807101726532, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6945, + "grad_norm": 0.03942221775650978, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6902, + "grad_norm": 0.09405840188264847, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6888, + "grad_norm": 0.04610726982355118, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6923, + "grad_norm": 0.036732323467731476, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6914, + "grad_norm": 0.11039964109659195, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6954, + "grad_norm": 0.15592555701732635, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.7007, + "grad_norm": 0.06869472563266754, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6902, + "grad_norm": 0.1269441545009613, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6956, + "grad_norm": 0.07067258656024933, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6906, + "grad_norm": 0.0604085698723793, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6931, + "grad_norm": 0.03121452033519745, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6914, + "grad_norm": 0.06452342122793198, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.691, + "grad_norm": 0.06869728118181229, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568720896, + "loss": 0.6909, + "grad_norm": 0.09471630305051804, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56871936, + "loss": 0.6933, + "grad_norm": 0.10710395872592926, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568733184, + "loss": 0.6913, + "grad_norm": 0.09963320195674896, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6887, + "grad_norm": 0.052151117473840714, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6939, + "grad_norm": 0.041492775082588196, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568723968, + "loss": 0.6928, + "grad_norm": 0.10671940445899963, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.696, + "grad_norm": 0.04117032140493393, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568716288, + "loss": 0.6914, + "grad_norm": 0.05413367971777916, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.85624576, + "gpu_mem": 4.56872704, + "loss": 0.6978, + "grad_norm": 0.17923520505428314, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6964, + "grad_norm": 0.0974799245595932, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568725504, + "loss": 0.6949, + "grad_norm": 0.0859425812959671, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "loss": 0.6937, + "grad_norm": 0.07871342450380325, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.85624576, + "gpu_mem": 4.568722432, + "train_runtime": 2934.6606, + "train_samples_per_second": 13.949, + "train_steps_per_second": 0.218, + "total_flos": 1.4734951559921664e+16, + "train_loss": 0.777228327281773 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26ebe9ef584396639cb6b281f2c8108d7f3fd14a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ed6721b7f54ab26b7292cf2f5764c4cde8cf4fe8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.3199658703071672 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..6c2b768b639baaa54bf903e170b5ea5073eb5f9f --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-arc_c-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2", + "seed": 42, + "timestamp": "2025-08-29T18:18:46.840920" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..bcb1610ee15866bc3a342e17d08c6a15ac6ca19d --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r2-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.679192064, + "gpu_mem": 4.423771648, + "loss": 4.4614, + "grad_norm": 16.15836524963379, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.684697088, + "gpu_mem": 4.4363776, + "loss": 4.6994, + "grad_norm": 16.514631271362305, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.684697088, + "gpu_mem": 4.43640832, + "loss": 4.2088, + "grad_norm": 17.19388198852539, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436374528, + "loss": 3.6439, + "grad_norm": 17.112430572509766, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.43636224, + "loss": 3.0013, + "grad_norm": 14.36368465423584, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436425216, + "loss": 2.634, + "grad_norm": 16.42566680908203, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.684893696, + "gpu_mem": 4.43643136, + "loss": 1.928, + "grad_norm": 6.609675884246826, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436389888, + "loss": 1.6857, + "grad_norm": 2.7208445072174072, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.684893696, + "gpu_mem": 4.43638528, + "loss": 1.6406, + "grad_norm": 3.141676902770996, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436374528, + "loss": 1.4243, + "grad_norm": 1.1265151500701904, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.684893696, + "gpu_mem": 4.43638528, + "loss": 1.3673, + "grad_norm": 0.7945563793182373, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436409856, + "loss": 1.469, + "grad_norm": 3.043248176574707, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436409856, + "loss": 1.5799, + "grad_norm": 4.370447158813477, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436357632, + "loss": 1.4707, + "grad_norm": 2.2408101558685303, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436432896, + "loss": 1.4692, + "grad_norm": 2.9767096042633057, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436426752, + "loss": 1.3922, + "grad_norm": 0.8569664359092712, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.43643136, + "loss": 1.4025, + "grad_norm": 1.8853036165237427, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442696192, + "loss": 1.9887, + "grad_norm": 1.4266043901443481, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442694656, + "loss": 1.3434, + "grad_norm": 0.8163749575614929, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.684893696, + "gpu_mem": 4.44267008, + "loss": 1.362, + "grad_norm": 1.9476549625396729, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.684893696, + "gpu_mem": 4.44267776, + "loss": 1.3675, + "grad_norm": 1.6805366277694702, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442706944, + "loss": 1.3555, + "grad_norm": 1.680128574371338, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442736128, + "loss": 1.3086, + "grad_norm": 0.9515631794929504, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442679296, + "loss": 1.37, + "grad_norm": 1.1897468566894531, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442748416, + "loss": 1.312, + "grad_norm": 1.4755345582962036, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442705408, + "loss": 1.331, + "grad_norm": 1.1155736446380615, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442663936, + "loss": 1.3349, + "grad_norm": 0.8196800947189331, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442710016, + "loss": 1.4945, + "grad_norm": 3.2902934551239014, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442705408, + "loss": 1.3649, + "grad_norm": 0.996429979801178, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442694656, + "loss": 1.387, + "grad_norm": 1.2195707559585571, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442725376, + "loss": 1.3682, + "grad_norm": 0.838907778263092, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442734592, + "loss": 1.355, + "grad_norm": 0.6391316056251526, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442714624, + "loss": 1.4071, + "grad_norm": 0.9244645237922668, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.684893696, + "gpu_mem": 4.44269312, + "loss": 1.3863, + "grad_norm": 0.990753710269928, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442580992, + "loss": 2.1315, + "grad_norm": 1.8902976512908936, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436403712, + "loss": 1.4269, + "grad_norm": 1.7275190353393555, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436412928, + "loss": 1.3643, + "grad_norm": 0.6880694627761841, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436383744, + "loss": 1.3635, + "grad_norm": 0.751710057258606, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436402176, + "loss": 1.3378, + "grad_norm": 0.4194468557834625, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436379136, + "loss": 1.3923, + "grad_norm": 0.6989176273345947, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436380672, + "loss": 1.3874, + "grad_norm": 0.7492637038230896, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436409856, + "loss": 1.3155, + "grad_norm": 0.8533624410629272, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436425216, + "loss": 1.3561, + "grad_norm": 0.7743812203407288, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436443648, + "loss": 1.3316, + "grad_norm": 0.6294273138046265, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436397568, + "loss": 1.3295, + "grad_norm": 1.2644308805465698, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436391424, + "loss": 1.2753, + "grad_norm": 1.008877158164978, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.684893696, + "gpu_mem": 4.43638528, + "loss": 1.3078, + "grad_norm": 1.1003679037094116, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436389888, + "loss": 1.2951, + "grad_norm": 1.5551389455795288, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436380672, + "loss": 1.3023, + "grad_norm": 1.0544744729995728, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.43636224, + "loss": 1.348, + "grad_norm": 1.3254765272140503, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436386816, + "loss": 1.3493, + "grad_norm": 1.0576057434082031, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.684893696, + "gpu_mem": 4.436414464, + "loss": 1.3591, + "grad_norm": 0.47110751271247864, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442691584, + "loss": 2.0005, + "grad_norm": 1.450379729270935, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442660864, + "loss": 1.3667, + "grad_norm": 0.9130564332008362, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442694656, + "loss": 1.3785, + "grad_norm": 1.3635129928588867, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442768384, + "loss": 1.3423, + "grad_norm": 0.7703405618667603, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442711552, + "loss": 1.3346, + "grad_norm": 0.4012717008590698, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442705408, + "loss": 1.2521, + "grad_norm": 1.1206154823303223, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442756096, + "loss": 1.3209, + "grad_norm": 0.6273525953292847, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442682368, + "loss": 1.349, + "grad_norm": 0.9245339035987854, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442696192, + "loss": 1.3687, + "grad_norm": 0.7916859984397888, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442697728, + "loss": 1.3427, + "grad_norm": 0.823630690574646, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442686976, + "loss": 1.3256, + "grad_norm": 0.5097457766532898, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442703872, + "loss": 1.3134, + "grad_norm": 0.605135440826416, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442725376, + "loss": 1.3379, + "grad_norm": 0.9839047193527222, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.684893696, + "gpu_mem": 4.44271616, + "loss": 1.3114, + "grad_norm": 1.184880256652832, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.684893696, + "gpu_mem": 4.442742272, + "loss": 1.31, + "grad_norm": 1.0318105220794678, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.44269312, + "loss": 1.3179, + "grad_norm": 0.7273362874984741, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.684893696, + "gpu_mem": 4.44269312, + "train_runtime": 386.0463, + "train_samples_per_second": 11.594, + "train_steps_per_second": 0.176, + "total_flos": 4001546965180416.0, + "train_loss": 1.621473962769789 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..03eabaea80bc9f8c1936ead28264f565a8ac69c0 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..28998d51471eb7eec0cb433eaa03b4fc357b8f72 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.37542662116040953 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..9305e5c670de5bf4f5f8e307818708e65851a305 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-arc_c-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2", + "seed": 42, + "timestamp": "2025-08-30T08:37:32.718821" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..7e7177f9b7a2ac6237b976c10ab5c887922ecfd5 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r32-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.720455168, + "gpu_mem": 4.518389248, + "loss": 4.4614, + "grad_norm": 64.07659149169922, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.725566976, + "gpu_mem": 4.7202304, + "loss": 4.6994, + "grad_norm": 65.2947998046875, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.725763584, + "gpu_mem": 4.72026112, + "loss": 2.3426, + "grad_norm": 31.082796096801758, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.725763584, + "gpu_mem": 4.720227328, + "loss": 1.5813, + "grad_norm": 4.258829593658447, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.725763584, + "gpu_mem": 4.72021504, + "loss": 1.4135, + "grad_norm": 2.7421786785125732, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.725763584, + "gpu_mem": 4.720278016, + "loss": 1.6048, + "grad_norm": 9.075495719909668, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.725763584, + "gpu_mem": 4.72028416, + "loss": 1.6982, + "grad_norm": 12.564203262329102, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720242688, + "loss": 1.4221, + "grad_norm": 2.09076189994812, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.725960192, + "gpu_mem": 4.72023808, + "loss": 1.3472, + "grad_norm": 2.7396843433380127, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720227328, + "loss": 1.6136, + "grad_norm": 5.114203929901123, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.725960192, + "gpu_mem": 4.72023808, + "loss": 1.4537, + "grad_norm": 3.0868992805480957, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720262656, + "loss": 1.4308, + "grad_norm": 3.1253883838653564, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720262656, + "loss": 1.4231, + "grad_norm": 6.2904133796691895, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720210432, + "loss": 1.4354, + "grad_norm": 3.2429397106170654, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720285696, + "loss": 1.3817, + "grad_norm": 1.4617501497268677, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720279552, + "loss": 1.5101, + "grad_norm": 4.85157585144043, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.72028416, + "loss": 1.3988, + "grad_norm": 2.35978102684021, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821166592, + "loss": 2.0866, + "grad_norm": 3.782046318054199, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821165056, + "loss": 1.3962, + "grad_norm": 1.7360825538635254, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.725960192, + "gpu_mem": 4.82114048, + "loss": 1.3508, + "grad_norm": 1.9804561138153076, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.725960192, + "gpu_mem": 4.82114816, + "loss": 1.4478, + "grad_norm": 2.795927047729492, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821177344, + "loss": 1.3806, + "grad_norm": 2.142228126525879, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821206528, + "loss": 1.3861, + "grad_norm": 4.4899163246154785, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821149696, + "loss": 1.3778, + "grad_norm": 1.379177212715149, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821218816, + "loss": 1.353, + "grad_norm": 2.0313150882720947, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821175808, + "loss": 1.3359, + "grad_norm": 1.1166026592254639, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821134336, + "loss": 1.3594, + "grad_norm": 1.3546416759490967, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821180416, + "loss": 1.5019, + "grad_norm": 3.3213307857513428, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821175808, + "loss": 1.3731, + "grad_norm": 0.8751102089881897, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821165056, + "loss": 1.3415, + "grad_norm": 0.9210039377212524, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821195776, + "loss": 1.3675, + "grad_norm": 0.6948165893554688, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821204992, + "loss": 1.3587, + "grad_norm": 0.8291835188865662, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821185024, + "loss": 1.4337, + "grad_norm": 1.7935640811920166, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.725960192, + "gpu_mem": 4.82116352, + "loss": 1.4033, + "grad_norm": 1.5851809978485107, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821051392, + "loss": 2.0291, + "grad_norm": 1.1575442552566528, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720256512, + "loss": 1.3524, + "grad_norm": 1.5324089527130127, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720265728, + "loss": 1.4613, + "grad_norm": 3.449747323989868, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720236544, + "loss": 1.3824, + "grad_norm": 2.016643524169922, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720254976, + "loss": 1.3725, + "grad_norm": 2.0235366821289062, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720231936, + "loss": 1.3694, + "grad_norm": 1.171370506286621, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720233472, + "loss": 1.3959, + "grad_norm": 1.2297394275665283, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720262656, + "loss": 1.338, + "grad_norm": 1.472746729850769, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720278016, + "loss": 1.3786, + "grad_norm": 2.2151598930358887, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720296448, + "loss": 1.3194, + "grad_norm": 0.7696126103401184, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720250368, + "loss": 1.32, + "grad_norm": 0.8085951209068298, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720244224, + "loss": 1.2945, + "grad_norm": 0.7401658892631531, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.725960192, + "gpu_mem": 4.72023808, + "loss": 1.2886, + "grad_norm": 0.9493575096130371, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720242688, + "loss": 1.2853, + "grad_norm": 0.8986085057258606, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720233472, + "loss": 1.2921, + "grad_norm": 0.907228410243988, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.72021504, + "loss": 1.3411, + "grad_norm": 1.119447112083435, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720239616, + "loss": 1.3237, + "grad_norm": 1.2924776077270508, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.725960192, + "gpu_mem": 4.720267264, + "loss": 1.3431, + "grad_norm": 0.9602690935134888, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821161984, + "loss": 1.9328, + "grad_norm": 1.6960599422454834, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821131264, + "loss": 1.3181, + "grad_norm": 0.9233318567276001, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821165056, + "loss": 1.3459, + "grad_norm": 1.386860728263855, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821238784, + "loss": 1.2846, + "grad_norm": 0.8437211513519287, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821181952, + "loss": 1.2875, + "grad_norm": 0.705308198928833, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821175808, + "loss": 1.2084, + "grad_norm": 1.1343612670898438, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821226496, + "loss": 1.243, + "grad_norm": 0.7648884654045105, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821152768, + "loss": 1.3477, + "grad_norm": 2.0110929012298584, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821166592, + "loss": 1.3471, + "grad_norm": 1.4026252031326294, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821168128, + "loss": 1.3246, + "grad_norm": 1.4051792621612549, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821157376, + "loss": 1.3261, + "grad_norm": 1.2583266496658325, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821174272, + "loss": 1.2887, + "grad_norm": 1.42184579372406, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821195776, + "loss": 1.3347, + "grad_norm": 1.728265404701233, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.725960192, + "gpu_mem": 4.82118656, + "loss": 1.254, + "grad_norm": 1.2858455181121826, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.725960192, + "gpu_mem": 4.821212672, + "loss": 1.2526, + "grad_norm": 1.1587661504745483, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.82116352, + "loss": 1.3022, + "grad_norm": 2.142974615097046, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.725960192, + "gpu_mem": 4.82116352, + "train_runtime": 387.4626, + "train_samples_per_second": 11.552, + "train_steps_per_second": 0.176, + "total_flos": 4092904137302016.0, + "train_loss": 1.5111301593920763 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..616e0cc3677d4646846654f1887fbef4d57d10ca --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cc19424c8217040caba621a6c2ccc8053b13e1fc --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.33532423208191126 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..12b262678ba6132ff5244067e339e6bbd592e465 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-arc_c-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T01:29:46.361882" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..d7910254e5f6bf5da898b0a19636f49c5f5a5e30 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_c-r8-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 1.68337408, + "gpu_mem": 4.442695168, + "loss": 4.4614, + "grad_norm": 31.636381149291992, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 1.688879104, + "gpu_mem": 4.49314816, + "loss": 4.6994, + "grad_norm": 32.280643463134766, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 1.689075712, + "gpu_mem": 4.49317888, + "loss": 3.6552, + "grad_norm": 31.23707389831543, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493145088, + "loss": 2.4482, + "grad_norm": 22.35167694091797, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 1.68927232, + "gpu_mem": 4.4931328, + "loss": 1.7979, + "grad_norm": 6.67398738861084, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493195776, + "loss": 1.6102, + "grad_norm": 3.161860466003418, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 1.68927232, + "gpu_mem": 4.49320192, + "loss": 1.4797, + "grad_norm": 2.3964056968688965, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493160448, + "loss": 1.431, + "grad_norm": 1.9207026958465576, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 1.68927232, + "gpu_mem": 4.49315584, + "loss": 1.3435, + "grad_norm": 2.6554951667785645, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493145088, + "loss": 1.5617, + "grad_norm": 5.361380100250244, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 1.68927232, + "gpu_mem": 4.49315584, + "loss": 1.4135, + "grad_norm": 2.331951856613159, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493180416, + "loss": 1.3706, + "grad_norm": 1.5762097835540771, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493180416, + "loss": 1.3928, + "grad_norm": 3.1512579917907715, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493128192, + "loss": 1.4474, + "grad_norm": 2.6397762298583984, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493203456, + "loss": 1.433, + "grad_norm": 2.0272762775421143, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493197312, + "loss": 1.4144, + "grad_norm": 1.3336317539215088, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 1.68927232, + "gpu_mem": 4.49320192, + "loss": 1.4782, + "grad_norm": 2.0704710483551025, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518390272, + "loss": 2.0582, + "grad_norm": 1.7593141794204712, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518388736, + "loss": 1.3606, + "grad_norm": 0.7312186360359192, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 1.68927232, + "gpu_mem": 4.51836416, + "loss": 1.3605, + "grad_norm": 1.7935729026794434, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 1.68927232, + "gpu_mem": 4.51837184, + "loss": 1.3925, + "grad_norm": 1.4244946241378784, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518401024, + "loss": 1.3754, + "grad_norm": 1.3779041767120361, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518430208, + "loss": 1.3037, + "grad_norm": 0.5090811848640442, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518373376, + "loss": 1.391, + "grad_norm": 1.1390941143035889, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518442496, + "loss": 1.3305, + "grad_norm": 1.386357307434082, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518399488, + "loss": 1.3259, + "grad_norm": 0.8728662133216858, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518358016, + "loss": 1.3498, + "grad_norm": 0.9702808856964111, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518404096, + "loss": 1.5135, + "grad_norm": 2.863943099975586, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518399488, + "loss": 1.357, + "grad_norm": 0.8509116768836975, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518388736, + "loss": 1.3626, + "grad_norm": 1.0801059007644653, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518419456, + "loss": 1.3396, + "grad_norm": 0.5183775424957275, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518428672, + "loss": 1.3716, + "grad_norm": 1.0891592502593994, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518408704, + "loss": 1.4182, + "grad_norm": 1.400274634361267, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 1.68927232, + "gpu_mem": 4.5183872, + "loss": 1.3807, + "grad_norm": 0.9330995678901672, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 1.68927232, + "gpu_mem": 4.518275072, + "loss": 2.1007, + "grad_norm": 1.5206280946731567, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493174272, + "loss": 1.3724, + "grad_norm": 1.0392643213272095, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493183488, + "loss": 1.4002, + "grad_norm": 1.0827171802520752, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 1.68927232, + "gpu_mem": 4.493154304, + "loss": 1.3645, + "grad_norm": 0.673108696937561, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493172736, + "loss": 1.3674, + "grad_norm": 0.7864466309547424, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493149696, + "loss": 1.3789, + "grad_norm": 0.43801477551460266, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493151232, + "loss": 1.387, + "grad_norm": 0.5549925565719604, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493180416, + "loss": 1.3385, + "grad_norm": 0.9428305625915527, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493195776, + "loss": 1.3473, + "grad_norm": 0.6966314911842346, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493214208, + "loss": 1.3283, + "grad_norm": 0.4528641998767853, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493168128, + "loss": 1.3179, + "grad_norm": 0.38591450452804565, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493161984, + "loss": 1.2922, + "grad_norm": 0.618762195110321, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 1.690845184, + "gpu_mem": 4.49315584, + "loss": 1.2893, + "grad_norm": 0.5950611233711243, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493160448, + "loss": 1.2752, + "grad_norm": 0.4702757000923157, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493151232, + "loss": 1.2898, + "grad_norm": 0.54841548204422, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 1.690845184, + "gpu_mem": 4.4931328, + "loss": 1.3438, + "grad_norm": 0.7720724940299988, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493157376, + "loss": 1.3173, + "grad_norm": 0.7672045230865479, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 1.690845184, + "gpu_mem": 4.493185024, + "loss": 1.3659, + "grad_norm": 0.7325172424316406, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518385664, + "loss": 1.9457, + "grad_norm": 0.9215725660324097, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518354944, + "loss": 1.3724, + "grad_norm": 0.9825336933135986, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518388736, + "loss": 1.3758, + "grad_norm": 1.2695392370224, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518462464, + "loss": 1.3178, + "grad_norm": 0.570442795753479, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518405632, + "loss": 1.317, + "grad_norm": 0.5343690514564514, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518399488, + "loss": 1.2187, + "grad_norm": 0.6323482990264893, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518450176, + "loss": 1.2944, + "grad_norm": 0.5856432318687439, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518376448, + "loss": 1.356, + "grad_norm": 0.9861604571342468, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518390272, + "loss": 1.3699, + "grad_norm": 0.8181774020195007, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518391808, + "loss": 1.3391, + "grad_norm": 0.7536750435829163, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518381056, + "loss": 1.3255, + "grad_norm": 0.6886075735092163, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518397952, + "loss": 1.304, + "grad_norm": 0.774066150188446, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518419456, + "loss": 1.3271, + "grad_norm": 0.7751500010490417, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 1.690845184, + "gpu_mem": 4.51841024, + "loss": 1.2896, + "grad_norm": 0.8040803074836731, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 1.690845184, + "gpu_mem": 4.518436352, + "loss": 1.2878, + "grad_norm": 0.6177627444267273, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.690845184, + "gpu_mem": 4.5183872, + "loss": 1.3015, + "grad_norm": 0.8774092793464661, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 1.690845184, + "gpu_mem": 4.5183872, + "train_runtime": 385.5468, + "train_samples_per_second": 11.609, + "train_steps_per_second": 0.176, + "total_flos": 4019818399604736.0, + "train_loss": 1.5448249929091509 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26ebe9ef584396639cb6b281f2c8108d7f3fd14a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..708f65cd43b697be8e5f8d6c883266f87bdca25f --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.2857744107744108 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..9ec76e5cc7ef279bb765f77c7089449f1a75c350 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-arc_e-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2", + "seed": 42, + "timestamp": "2025-08-29T17:37:16.499759" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..5782c226dac2d13d893a1fdc71f9accc69ad0376 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r2-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.688563712, + "gpu_mem": 4.423716352, + "loss": 4.6319, + "grad_norm": 16.733354568481445, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.694265344, + "gpu_mem": 4.43639296, + "loss": 4.4578, + "grad_norm": 16.731847763061523, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.694461952, + "gpu_mem": 4.436371456, + "loss": 4.5584, + "grad_norm": 17.068809509277344, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.69465856, + "gpu_mem": 4.436349952, + "loss": 4.5509, + "grad_norm": 16.538555145263672, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.694855168, + "gpu_mem": 4.436391424, + "loss": 4.0563, + "grad_norm": 16.463178634643555, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.694855168, + "gpu_mem": 4.436366848, + "loss": 3.3959, + "grad_norm": 16.10589599609375, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.694855168, + "gpu_mem": 4.436389888, + "loss": 2.9983, + "grad_norm": 12.606690406799316, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436348416, + "loss": 2.4644, + "grad_norm": 15.964723587036133, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436349952, + "loss": 2.3265, + "grad_norm": 13.07651138305664, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436345344, + "loss": 1.8553, + "grad_norm": 3.890451192855835, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43642368, + "loss": 1.6456, + "grad_norm": 2.9075863361358643, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436397568, + "loss": 1.589, + "grad_norm": 2.6345012187957764, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436348416, + "loss": 1.4205, + "grad_norm": 1.7682219743728638, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43636992, + "loss": 1.4318, + "grad_norm": 3.3279316425323486, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43634688, + "loss": 1.3586, + "grad_norm": 2.2742724418640137, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436351488, + "loss": 1.351, + "grad_norm": 1.0051603317260742, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436388352, + "loss": 1.3166, + "grad_norm": 1.7901557683944702, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436399104, + "loss": 1.3686, + "grad_norm": 2.816706895828247, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436342272, + "loss": 1.4891, + "grad_norm": 5.841419219970703, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436412928, + "loss": 1.4813, + "grad_norm": 6.3125224113464355, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436411392, + "loss": 1.3097, + "grad_norm": 1.6513506174087524, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436368384, + "loss": 1.305, + "grad_norm": 2.241537094116211, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43638528, + "loss": 1.3069, + "grad_norm": 1.4712564945220947, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436342272, + "loss": 1.3613, + "grad_norm": 1.906409502029419, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436371456, + "loss": 1.3563, + "grad_norm": 0.9724729061126709, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436351488, + "loss": 1.4104, + "grad_norm": 0.8772833347320557, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.695051776, + "gpu_mem": 4.4363776, + "loss": 1.3526, + "grad_norm": 1.0292201042175293, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.695051776, + "gpu_mem": 4.4363776, + "loss": 1.3533, + "grad_norm": 1.3381017446517944, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436356096, + "loss": 1.2556, + "grad_norm": 1.062872052192688, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43634688, + "loss": 1.3615, + "grad_norm": 0.949510931968689, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436365312, + "loss": 1.3413, + "grad_norm": 0.5636250972747803, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436388352, + "loss": 1.3528, + "grad_norm": 1.3210155963897705, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43638528, + "loss": 1.3639, + "grad_norm": 1.3865032196044922, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436388352, + "loss": 1.4274, + "grad_norm": 1.7470793724060059, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43636992, + "loss": 1.3043, + "grad_norm": 0.6553197503089905, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442668544, + "loss": 1.9333, + "grad_norm": 1.119891881942749, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442673152, + "loss": 1.358, + "grad_norm": 0.9220662713050842, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442651648, + "loss": 1.246, + "grad_norm": 0.7248559594154358, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442640896, + "loss": 1.3683, + "grad_norm": 1.4945263862609863, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442703872, + "loss": 1.4034, + "grad_norm": 1.2522392272949219, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442663936, + "loss": 1.3205, + "grad_norm": 0.44245103001594543, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442706944, + "loss": 1.3396, + "grad_norm": 0.4195997416973114, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442656256, + "loss": 1.4157, + "grad_norm": 0.9303027391433716, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442720768, + "loss": 1.3676, + "grad_norm": 0.986302375793457, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442688512, + "loss": 1.3656, + "grad_norm": 0.7733666896820068, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44269312, + "loss": 1.3775, + "grad_norm": 0.8901095390319824, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44263936, + "loss": 1.3393, + "grad_norm": 1.5515167713165283, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442653184, + "loss": 1.3299, + "grad_norm": 0.9678941369056702, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442642432, + "loss": 1.3326, + "grad_norm": 0.8067137598991394, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442656256, + "loss": 1.3189, + "grad_norm": 0.6493967771530151, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44270848, + "loss": 1.3477, + "grad_norm": 0.8248728513717651, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442656256, + "loss": 1.3632, + "grad_norm": 1.142303228378296, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442725376, + "loss": 1.3292, + "grad_norm": 0.8967983722686768, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44269312, + "loss": 1.3048, + "grad_norm": 1.0400514602661133, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442702336, + "loss": 1.36, + "grad_norm": 0.7784701585769653, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44267776, + "loss": 1.3099, + "grad_norm": 0.5178675651550293, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442711552, + "loss": 1.3165, + "grad_norm": 0.5065286159515381, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44269312, + "loss": 1.3229, + "grad_norm": 0.6662645936012268, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442679296, + "loss": 1.3101, + "grad_norm": 0.3355940878391266, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442717696, + "loss": 1.309, + "grad_norm": 1.169063925743103, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442650112, + "loss": 1.2904, + "grad_norm": 0.88178950548172, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442697728, + "loss": 1.3603, + "grad_norm": 0.8476381301879883, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44264704, + "loss": 1.3305, + "grad_norm": 1.2525146007537842, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442696192, + "loss": 1.3485, + "grad_norm": 1.333308458328247, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442694656, + "loss": 1.3355, + "grad_norm": 0.6429489254951477, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442713088, + "loss": 1.2971, + "grad_norm": 0.7872833609580994, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44265472, + "loss": 1.3045, + "grad_norm": 0.5733119249343872, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442667008, + "loss": 1.3512, + "grad_norm": 0.44897904992103577, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442691584, + "loss": 1.326, + "grad_norm": 1.0834873914718628, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442668544, + "loss": 1.2832, + "grad_norm": 0.6222435235977173, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442504192, + "loss": 2.0347, + "grad_norm": 2.3903703689575195, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436380672, + "loss": 1.343, + "grad_norm": 0.5889484882354736, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436343808, + "loss": 1.3137, + "grad_norm": 0.5363865494728088, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436403712, + "loss": 1.3111, + "grad_norm": 0.5384390354156494, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436371456, + "loss": 1.3446, + "grad_norm": 0.8592147827148438, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436382208, + "loss": 1.2992, + "grad_norm": 0.44442299008369446, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436419072, + "loss": 1.3468, + "grad_norm": 0.34039196372032166, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436403712, + "loss": 1.3645, + "grad_norm": 0.8474765419960022, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43635456, + "loss": 1.2834, + "grad_norm": 0.7842165231704712, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436399104, + "loss": 1.3535, + "grad_norm": 0.6954917907714844, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43638528, + "loss": 1.3242, + "grad_norm": 0.5529902577400208, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436353024, + "loss": 1.3316, + "grad_norm": 0.6608796119689941, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436403712, + "loss": 1.313, + "grad_norm": 1.658279299736023, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436342272, + "loss": 1.3687, + "grad_norm": 0.972527027130127, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436388352, + "loss": 1.3726, + "grad_norm": 1.023406982421875, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436342272, + "loss": 1.3362, + "grad_norm": 0.7401344180107117, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436372992, + "loss": 1.3764, + "grad_norm": 0.6783835291862488, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436348416, + "loss": 1.3412, + "grad_norm": 0.9525921940803528, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436402176, + "loss": 1.3006, + "grad_norm": 0.7852182984352112, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436383744, + "loss": 1.3294, + "grad_norm": 0.7373659014701843, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436333056, + "loss": 1.4118, + "grad_norm": 1.4976142644882202, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436357632, + "loss": 1.3062, + "grad_norm": 0.4859124720096588, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436360704, + "loss": 1.2483, + "grad_norm": 1.0271308422088623, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436353024, + "loss": 1.2847, + "grad_norm": 0.4792281985282898, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436391424, + "loss": 1.3146, + "grad_norm": 0.564704179763794, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43640064, + "loss": 1.2438, + "grad_norm": 1.0190887451171875, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436343808, + "loss": 1.3397, + "grad_norm": 0.7395093441009521, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436343808, + "loss": 1.341, + "grad_norm": 0.4369877874851227, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436340736, + "loss": 1.2914, + "grad_norm": 0.6666271090507507, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.695051776, + "gpu_mem": 4.4363392, + "loss": 1.2739, + "grad_norm": 0.7983679175376892, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436382208, + "loss": 1.2501, + "grad_norm": 0.7632639408111572, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436320768, + "loss": 1.2884, + "grad_norm": 0.4494268000125885, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43636992, + "loss": 1.2984, + "grad_norm": 0.4631851613521576, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436432896, + "loss": 1.333, + "grad_norm": 0.9131077527999878, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.695051776, + "gpu_mem": 4.43638528, + "loss": 1.2615, + "grad_norm": 0.4854181706905365, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.695051776, + "gpu_mem": 4.436366848, + "loss": 1.3068, + "grad_norm": 0.4039210081100464, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442691584, + "loss": 1.8431, + "grad_norm": 1.3718855381011963, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442673152, + "loss": 1.3033, + "grad_norm": 0.7583311200141907, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.695051776, + "gpu_mem": 4.4426624, + "loss": 1.308, + "grad_norm": 0.5152148008346558, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44271616, + "loss": 1.3468, + "grad_norm": 2.0088555812835693, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442676224, + "loss": 1.3182, + "grad_norm": 0.7090792655944824, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442694656, + "loss": 1.2909, + "grad_norm": 0.4198123514652252, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442757632, + "loss": 1.3008, + "grad_norm": 0.6029356122016907, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44268544, + "loss": 1.3313, + "grad_norm": 0.6015781760215759, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442679296, + "loss": 1.3507, + "grad_norm": 1.096090316772461, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442694656, + "loss": 1.3108, + "grad_norm": 0.6515424847602844, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442710016, + "loss": 1.2689, + "grad_norm": 0.6639953255653381, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.695051776, + "gpu_mem": 4.4427008, + "loss": 1.3026, + "grad_norm": 0.699027419090271, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442691584, + "loss": 1.3284, + "grad_norm": 0.5354934930801392, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442710016, + "loss": 1.2948, + "grad_norm": 0.5070159435272217, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44270848, + "loss": 1.2932, + "grad_norm": 0.6671300530433655, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442665472, + "loss": 1.2886, + "grad_norm": 0.6800514459609985, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442697728, + "loss": 1.2872, + "grad_norm": 0.6827869415283203, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442651648, + "loss": 1.3156, + "grad_norm": 0.4668143689632416, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442696192, + "loss": 1.2967, + "grad_norm": 1.0199906826019287, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44264704, + "loss": 1.2844, + "grad_norm": 0.7969029545783997, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442659328, + "loss": 1.3007, + "grad_norm": 0.44741517305374146, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442683904, + "loss": 1.301, + "grad_norm": 0.6374433636665344, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442645504, + "loss": 1.3001, + "grad_norm": 0.41190409660339355, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442648576, + "loss": 1.3122, + "grad_norm": 0.4634973406791687, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442660864, + "loss": 1.2695, + "grad_norm": 0.5479190945625305, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442625536, + "loss": 1.2789, + "grad_norm": 0.5278027057647705, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442667008, + "loss": 1.2556, + "grad_norm": 0.635922908782959, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442682368, + "loss": 1.2523, + "grad_norm": 0.6001003384590149, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44264704, + "loss": 1.3082, + "grad_norm": 1.1793105602264404, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.695051776, + "gpu_mem": 4.44265472, + "loss": 1.3359, + "grad_norm": 0.6606824398040771, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442676224, + "loss": 1.2964, + "grad_norm": 0.8650245070457458, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.695051776, + "gpu_mem": 4.442686976, + "loss": 1.3482, + "grad_norm": 0.6099734902381897, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.695248384, + "gpu_mem": 4.442679296, + "loss": 1.3525, + "grad_norm": 0.5313996076583862, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.695248384, + "gpu_mem": 4.442713088, + "loss": 1.3165, + "grad_norm": 0.5951855778694153, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.695248384, + "gpu_mem": 4.442713088, + "train_runtime": 693.2125, + "train_samples_per_second": 12.989, + "train_steps_per_second": 0.202, + "total_flos": 7230061454266368.0, + "train_loss": 1.502149166379656 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..03eabaea80bc9f8c1936ead28264f565a8ac69c0 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b04a24f23363eb9b09cb983f636c5be426d926cf --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.5054713804713805 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5412471a93d95cb5a9489a59dafc56077a1cd51b --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-arc_e-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2", + "seed": 42, + "timestamp": "2025-08-30T07:57:38.768863" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..ae553e8a97b6f10f43f26eeb0f08a293d91587cc --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r32-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.777905664, + "gpu_mem": 4.518333952, + "loss": 4.6319, + "grad_norm": 66.03837585449219, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.725468672, + "gpu_mem": 4.72024576, + "loss": 4.4578, + "grad_norm": 66.08963775634766, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.72566528, + "gpu_mem": 4.720224256, + "loss": 3.3425, + "grad_norm": 45.85478210449219, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.726058496, + "gpu_mem": 4.720202752, + "loss": 2.5582, + "grad_norm": 72.2707748413086, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.726058496, + "gpu_mem": 4.720244224, + "loss": 1.6846, + "grad_norm": 7.0534586906433105, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.726255104, + "gpu_mem": 4.720219648, + "loss": 1.51, + "grad_norm": 5.754985332489014, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.726255104, + "gpu_mem": 4.720242688, + "loss": 1.4494, + "grad_norm": 4.177745342254639, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.726255104, + "gpu_mem": 4.720201216, + "loss": 1.3782, + "grad_norm": 3.205333948135376, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.726255104, + "gpu_mem": 4.720202752, + "loss": 1.3586, + "grad_norm": 3.2945213317871094, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.726255104, + "gpu_mem": 4.720198144, + "loss": 1.591, + "grad_norm": 11.20108699798584, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.726255104, + "gpu_mem": 4.72027648, + "loss": 1.3609, + "grad_norm": 2.2239484786987305, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720250368, + "loss": 1.34, + "grad_norm": 2.1631321907043457, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720201216, + "loss": 1.389, + "grad_norm": 2.425006628036499, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72022272, + "loss": 1.3962, + "grad_norm": 2.440736770629883, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72019968, + "loss": 1.363, + "grad_norm": 1.4997576475143433, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720204288, + "loss": 1.3945, + "grad_norm": 1.1862719058990479, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720241152, + "loss": 1.3532, + "grad_norm": 1.2509536743164062, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720251904, + "loss": 1.3652, + "grad_norm": 1.7078202962875366, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720195072, + "loss": 1.3714, + "grad_norm": 2.476296901702881, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720265728, + "loss": 1.4314, + "grad_norm": 2.513395309448242, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720264192, + "loss": 1.3377, + "grad_norm": 2.0373988151550293, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720221184, + "loss": 1.3517, + "grad_norm": 1.954820990562439, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72023808, + "loss": 1.3221, + "grad_norm": 1.3133777379989624, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720195072, + "loss": 1.3367, + "grad_norm": 1.6057462692260742, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720224256, + "loss": 1.4123, + "grad_norm": 2.1148736476898193, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720204288, + "loss": 1.4671, + "grad_norm": 1.6069798469543457, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.726451712, + "gpu_mem": 4.7202304, + "loss": 1.3511, + "grad_norm": 1.189152717590332, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.726451712, + "gpu_mem": 4.7202304, + "loss": 1.3827, + "grad_norm": 1.0186585187911987, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720208896, + "loss": 1.2864, + "grad_norm": 1.3406333923339844, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72019968, + "loss": 1.3332, + "grad_norm": 0.871654748916626, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720218112, + "loss": 1.3581, + "grad_norm": 0.9379727244377136, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720241152, + "loss": 1.3325, + "grad_norm": 1.482436180114746, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72023808, + "loss": 1.3551, + "grad_norm": 0.916314959526062, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720241152, + "loss": 1.3864, + "grad_norm": 1.1163442134857178, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72022272, + "loss": 1.3008, + "grad_norm": 0.6469650268554688, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821138944, + "loss": 1.9619, + "grad_norm": 1.981237769126892, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821143552, + "loss": 1.3713, + "grad_norm": 1.6961532831192017, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821122048, + "loss": 1.2336, + "grad_norm": 1.1861004829406738, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821111296, + "loss": 1.3381, + "grad_norm": 2.140319347381592, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821174272, + "loss": 1.3624, + "grad_norm": 1.6575071811676025, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821134336, + "loss": 1.363, + "grad_norm": 2.02266263961792, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821177344, + "loss": 1.3677, + "grad_norm": 1.6093175411224365, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821126656, + "loss": 1.3834, + "grad_norm": 0.8308978080749512, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821191168, + "loss": 1.3682, + "grad_norm": 1.159053921699524, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821158912, + "loss": 1.3729, + "grad_norm": 0.9415165185928345, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82116352, + "loss": 1.3652, + "grad_norm": 1.0574718713760376, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82110976, + "loss": 1.3281, + "grad_norm": 1.7069846391677856, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821123584, + "loss": 1.3138, + "grad_norm": 1.1803359985351562, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821112832, + "loss": 1.355, + "grad_norm": 2.1591832637786865, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821126656, + "loss": 1.3911, + "grad_norm": 2.5143864154815674, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82117888, + "loss": 1.3506, + "grad_norm": 2.433032274246216, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821126656, + "loss": 1.4117, + "grad_norm": 3.9875152111053467, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821195776, + "loss": 1.301, + "grad_norm": 1.0287299156188965, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82116352, + "loss": 1.3172, + "grad_norm": 1.8995263576507568, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821172736, + "loss": 1.3802, + "grad_norm": 2.2760636806488037, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82114816, + "loss": 1.303, + "grad_norm": 1.452477216720581, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821181952, + "loss": 1.325, + "grad_norm": 1.2978683710098267, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82116352, + "loss": 1.3087, + "grad_norm": 1.2289164066314697, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821149696, + "loss": 1.3615, + "grad_norm": 1.8523905277252197, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821188096, + "loss": 1.3801, + "grad_norm": 2.1218063831329346, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821120512, + "loss": 1.3031, + "grad_norm": 1.4536633491516113, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821168128, + "loss": 1.3724, + "grad_norm": 1.313852310180664, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82111744, + "loss": 1.3494, + "grad_norm": 1.6237351894378662, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821166592, + "loss": 1.3347, + "grad_norm": 1.2837320566177368, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821165056, + "loss": 1.3533, + "grad_norm": 1.3841702938079834, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821183488, + "loss": 1.3392, + "grad_norm": 2.0691921710968018, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82112512, + "loss": 1.3528, + "grad_norm": 1.3308155536651611, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821137408, + "loss": 1.3626, + "grad_norm": 0.8293198943138123, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821161984, + "loss": 1.3145, + "grad_norm": 1.2693272829055786, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821138944, + "loss": 1.2607, + "grad_norm": 1.3352925777435303, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.726451712, + "gpu_mem": 4.820974592, + "loss": 1.922, + "grad_norm": 1.9041682481765747, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720233472, + "loss": 1.404, + "grad_norm": 1.658454418182373, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720196608, + "loss": 1.3413, + "grad_norm": 1.2257252931594849, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720256512, + "loss": 1.3167, + "grad_norm": 1.1663110256195068, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720224256, + "loss": 1.3468, + "grad_norm": 1.1925673484802246, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720235008, + "loss": 1.2712, + "grad_norm": 0.5582539439201355, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720271872, + "loss": 1.3353, + "grad_norm": 0.7910153865814209, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720256512, + "loss": 1.3564, + "grad_norm": 0.9837383031845093, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72020736, + "loss": 1.2749, + "grad_norm": 1.0822142362594604, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720251904, + "loss": 1.3183, + "grad_norm": 0.892155647277832, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72023808, + "loss": 1.3268, + "grad_norm": 0.8442540168762207, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720205824, + "loss": 1.3052, + "grad_norm": 0.9757601618766785, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720256512, + "loss": 1.3343, + "grad_norm": 1.9416048526763916, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720195072, + "loss": 1.3589, + "grad_norm": 1.134802222251892, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720241152, + "loss": 1.345, + "grad_norm": 1.4902468919754028, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720195072, + "loss": 1.348, + "grad_norm": 1.1360399723052979, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720225792, + "loss": 1.3494, + "grad_norm": 1.367822289466858, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720201216, + "loss": 1.2545, + "grad_norm": 1.2328016757965088, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720254976, + "loss": 1.2513, + "grad_norm": 1.2508933544158936, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720236544, + "loss": 1.3723, + "grad_norm": 1.9212051630020142, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720185856, + "loss": 1.3844, + "grad_norm": 2.2792913913726807, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720210432, + "loss": 1.3145, + "grad_norm": 1.4580612182617188, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720213504, + "loss": 1.2835, + "grad_norm": 1.9744666814804077, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720205824, + "loss": 1.2464, + "grad_norm": 1.0725305080413818, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720244224, + "loss": 1.2719, + "grad_norm": 1.198302149772644, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72025344, + "loss": 1.1486, + "grad_norm": 1.2075554132461548, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720196608, + "loss": 1.3245, + "grad_norm": 1.8964240550994873, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720196608, + "loss": 1.3332, + "grad_norm": 1.9328669309616089, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720193536, + "loss": 1.2582, + "grad_norm": 1.4671530723571777, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720192, + "loss": 1.2488, + "grad_norm": 1.7880338430404663, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720235008, + "loss": 1.1896, + "grad_norm": 1.2528692483901978, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720173568, + "loss": 1.2893, + "grad_norm": 1.3280245065689087, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72022272, + "loss": 1.2294, + "grad_norm": 0.959479808807373, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720285696, + "loss": 1.3722, + "grad_norm": 2.0665886402130127, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.726451712, + "gpu_mem": 4.72023808, + "loss": 1.2206, + "grad_norm": 1.2340753078460693, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.726451712, + "gpu_mem": 4.720219648, + "loss": 1.2566, + "grad_norm": 1.4578205347061157, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821161984, + "loss": 1.7452, + "grad_norm": 2.5083541870117188, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821143552, + "loss": 1.2074, + "grad_norm": 1.4265955686569214, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.726451712, + "gpu_mem": 4.8211328, + "loss": 1.1774, + "grad_norm": 1.1866763830184937, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82118656, + "loss": 1.211, + "grad_norm": 1.914292335510254, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821146624, + "loss": 1.2281, + "grad_norm": 1.467635154724121, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821165056, + "loss": 1.1892, + "grad_norm": 1.324704647064209, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821228032, + "loss": 1.1712, + "grad_norm": 1.2509040832519531, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82115584, + "loss": 1.2038, + "grad_norm": 1.8074793815612793, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821149696, + "loss": 1.282, + "grad_norm": 1.7198525667190552, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821165056, + "loss": 1.1502, + "grad_norm": 1.4012295007705688, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821180416, + "loss": 1.1929, + "grad_norm": 1.7254172563552856, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.726451712, + "gpu_mem": 4.8211712, + "loss": 1.1839, + "grad_norm": 1.9166858196258545, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821161984, + "loss": 1.2102, + "grad_norm": 2.0617146492004395, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821180416, + "loss": 1.1838, + "grad_norm": 1.6986435651779175, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82117888, + "loss": 1.1245, + "grad_norm": 1.622990608215332, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821135872, + "loss": 1.0628, + "grad_norm": 1.7015457153320312, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821168128, + "loss": 1.1193, + "grad_norm": 2.1626291275024414, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821122048, + "loss": 1.1555, + "grad_norm": 1.754165768623352, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821166592, + "loss": 1.1148, + "grad_norm": 2.4328746795654297, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82111744, + "loss": 1.0728, + "grad_norm": 1.994571328163147, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821129728, + "loss": 1.1135, + "grad_norm": 2.1933178901672363, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821154304, + "loss": 1.1208, + "grad_norm": 2.465247392654419, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821115904, + "loss": 1.0761, + "grad_norm": 2.4749441146850586, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821118976, + "loss": 1.1505, + "grad_norm": 2.6121437549591064, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821131264, + "loss": 1.0602, + "grad_norm": 2.501903772354126, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821095936, + "loss": 1.0899, + "grad_norm": 2.1251721382141113, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821137408, + "loss": 1.1153, + "grad_norm": 2.649550676345825, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821152768, + "loss": 1.1046, + "grad_norm": 2.4253811836242676, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82111744, + "loss": 1.1179, + "grad_norm": 2.575252056121826, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.726451712, + "gpu_mem": 4.82112512, + "loss": 1.1422, + "grad_norm": 2.5371711254119873, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821146624, + "loss": 1.0676, + "grad_norm": 2.5488364696502686, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821157376, + "loss": 1.0803, + "grad_norm": 2.0775012969970703, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821149696, + "loss": 1.1464, + "grad_norm": 2.855201482772827, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821183488, + "loss": 1.0758, + "grad_norm": 2.365943670272827, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.726451712, + "gpu_mem": 4.821183488, + "train_runtime": 695.2257, + "train_samples_per_second": 12.951, + "train_steps_per_second": 0.201, + "total_flos": 7395127108743168.0, + "train_loss": 1.3760694035461971 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..616e0cc3677d4646846654f1887fbef4d57d10ca --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..404487193c818fc57861f6c4d35daa7c276f3163 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.3846801346801347 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..dc6902840e5e644d8581a02a2c93128e83834985 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-arc_e-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T00:49:33.080472" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..1d6d04e1997d3717c89a866faf3a61955bb5a80a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-arc_e-r8-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 1.723809792, + "gpu_mem": 4.442639872, + "loss": 4.6319, + "grad_norm": 32.376888275146484, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 1.729314816, + "gpu_mem": 4.49316352, + "loss": 4.4578, + "grad_norm": 32.73485565185547, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 1.729511424, + "gpu_mem": 4.493142016, + "loss": 4.2763, + "grad_norm": 32.61184310913086, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 1.729708032, + "gpu_mem": 4.493120512, + "loss": 3.7748, + "grad_norm": 26.892738342285156, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 1.72990464, + "gpu_mem": 4.493161984, + "loss": 2.9348, + "grad_norm": 50.64318084716797, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 1.72990464, + "gpu_mem": 4.493137408, + "loss": 2.0591, + "grad_norm": 14.761756896972656, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 1.730101248, + "gpu_mem": 4.493160448, + "loss": 1.8346, + "grad_norm": 5.141823768615723, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 1.730101248, + "gpu_mem": 4.493118976, + "loss": 1.5876, + "grad_norm": 2.794651985168457, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 1.730101248, + "gpu_mem": 4.493120512, + "loss": 1.4263, + "grad_norm": 2.0859081745147705, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 1.730101248, + "gpu_mem": 4.493115904, + "loss": 1.534, + "grad_norm": 4.199974060058594, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 1.730101248, + "gpu_mem": 4.49319424, + "loss": 1.4339, + "grad_norm": 3.905238628387451, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493168128, + "loss": 1.3136, + "grad_norm": 1.6123130321502686, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493118976, + "loss": 1.4198, + "grad_norm": 4.1598076820373535, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49314048, + "loss": 1.3593, + "grad_norm": 2.6406188011169434, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49311744, + "loss": 1.4025, + "grad_norm": 2.4922306537628174, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493122048, + "loss": 1.377, + "grad_norm": 1.5196889638900757, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493158912, + "loss": 1.3516, + "grad_norm": 1.6665343046188354, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493169664, + "loss": 1.3446, + "grad_norm": 1.5395841598510742, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493112832, + "loss": 1.3577, + "grad_norm": 2.532010555267334, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493183488, + "loss": 1.3926, + "grad_norm": 1.8894164562225342, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493181952, + "loss": 1.3161, + "grad_norm": 1.0951297283172607, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493138944, + "loss": 1.3383, + "grad_norm": 1.3431637287139893, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49315584, + "loss": 1.3116, + "grad_norm": 0.9438855051994324, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493112832, + "loss": 1.351, + "grad_norm": 1.2012569904327393, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493142016, + "loss": 1.3765, + "grad_norm": 0.7998529672622681, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493122048, + "loss": 1.4242, + "grad_norm": 0.9050416350364685, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49314816, + "loss": 1.3562, + "grad_norm": 0.6776003241539001, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49314816, + "loss": 1.3673, + "grad_norm": 0.8479077219963074, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493126656, + "loss": 1.2591, + "grad_norm": 0.8013228178024292, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49311744, + "loss": 1.3629, + "grad_norm": 0.7932245135307312, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493135872, + "loss": 1.3566, + "grad_norm": 0.7106972336769104, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493158912, + "loss": 1.3325, + "grad_norm": 1.1124621629714966, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49315584, + "loss": 1.3591, + "grad_norm": 0.9761446714401245, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493158912, + "loss": 1.4319, + "grad_norm": 1.4460577964782715, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49314048, + "loss": 1.3133, + "grad_norm": 0.842886209487915, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518362624, + "loss": 1.9469, + "grad_norm": 1.2212440967559814, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518367232, + "loss": 1.333, + "grad_norm": 0.4803023338317871, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518345728, + "loss": 1.2358, + "grad_norm": 0.6758935451507568, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518334976, + "loss": 1.4057, + "grad_norm": 2.309436559677124, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518397952, + "loss": 1.4445, + "grad_norm": 2.2486984729766846, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518358016, + "loss": 1.3211, + "grad_norm": 0.712881326675415, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518401024, + "loss": 1.3408, + "grad_norm": 0.601819634437561, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518350336, + "loss": 1.4024, + "grad_norm": 1.2004601955413818, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518414848, + "loss": 1.3519, + "grad_norm": 1.2012524604797363, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518382592, + "loss": 1.3718, + "grad_norm": 1.2979300022125244, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 1.730297856, + "gpu_mem": 4.5183872, + "loss": 1.4052, + "grad_norm": 2.025125026702881, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51833344, + "loss": 1.375, + "grad_norm": 2.6145341396331787, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518347264, + "loss": 1.3436, + "grad_norm": 1.5505282878875732, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518336512, + "loss": 1.3318, + "grad_norm": 1.0699002742767334, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518350336, + "loss": 1.3251, + "grad_norm": 0.6353930234909058, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51840256, + "loss": 1.3499, + "grad_norm": 1.0858540534973145, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518350336, + "loss": 1.3912, + "grad_norm": 1.4450255632400513, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518419456, + "loss": 1.3261, + "grad_norm": 1.0397456884384155, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 1.730297856, + "gpu_mem": 4.5183872, + "loss": 1.3099, + "grad_norm": 1.0418322086334229, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518396416, + "loss": 1.3527, + "grad_norm": 0.6689828038215637, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51837184, + "loss": 1.3033, + "grad_norm": 0.4389662444591522, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518405632, + "loss": 1.313, + "grad_norm": 0.483308881521225, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 1.730297856, + "gpu_mem": 4.5183872, + "loss": 1.3254, + "grad_norm": 0.8518397808074951, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518373376, + "loss": 1.3104, + "grad_norm": 0.33613321185112, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518411776, + "loss": 1.3206, + "grad_norm": 1.0505858659744263, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518344192, + "loss": 1.282, + "grad_norm": 0.7932109236717224, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518391808, + "loss": 1.3669, + "grad_norm": 0.8794665336608887, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51834112, + "loss": 1.3392, + "grad_norm": 1.1630457639694214, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518390272, + "loss": 1.3429, + "grad_norm": 1.0927584171295166, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518388736, + "loss": 1.3228, + "grad_norm": 0.6140114068984985, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518407168, + "loss": 1.2911, + "grad_norm": 0.8698592185974121, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 1.730297856, + "gpu_mem": 4.5183488, + "loss": 1.3174, + "grad_norm": 0.7384346127510071, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518361088, + "loss": 1.3537, + "grad_norm": 0.7509185671806335, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518385664, + "loss": 1.332, + "grad_norm": 1.1684280633926392, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518362624, + "loss": 1.2766, + "grad_norm": 0.6160197854042053, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518198272, + "loss": 2.0365, + "grad_norm": 2.357161283493042, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493151232, + "loss": 1.3431, + "grad_norm": 0.8691197037696838, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493114368, + "loss": 1.2934, + "grad_norm": 0.4674645662307739, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493174272, + "loss": 1.2888, + "grad_norm": 0.47916749119758606, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493142016, + "loss": 1.3249, + "grad_norm": 0.7412468194961548, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493152768, + "loss": 1.2822, + "grad_norm": 0.5107205510139465, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493189632, + "loss": 1.3326, + "grad_norm": 0.48960080742836, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493174272, + "loss": 1.3668, + "grad_norm": 0.7163977026939392, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49312512, + "loss": 1.2607, + "grad_norm": 1.078974723815918, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493169664, + "loss": 1.3331, + "grad_norm": 0.7531007528305054, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49315584, + "loss": 1.3514, + "grad_norm": 1.3257535696029663, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493123584, + "loss": 1.3362, + "grad_norm": 1.4762738943099976, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493174272, + "loss": 1.2869, + "grad_norm": 1.357598066329956, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493112832, + "loss": 1.379, + "grad_norm": 1.53059720993042, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493158912, + "loss": 1.371, + "grad_norm": 1.7697720527648926, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493112832, + "loss": 1.3035, + "grad_norm": 0.7443304061889648, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493143552, + "loss": 1.375, + "grad_norm": 1.1929677724838257, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493118976, + "loss": 1.3073, + "grad_norm": 1.1270703077316284, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493172736, + "loss": 1.2851, + "grad_norm": 1.1373447179794312, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493154304, + "loss": 1.3251, + "grad_norm": 1.0819443464279175, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493103616, + "loss": 1.3776, + "grad_norm": 1.9100227355957031, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493128192, + "loss": 1.3118, + "grad_norm": 0.6823450922966003, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493131264, + "loss": 1.2504, + "grad_norm": 0.9577957987785339, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493123584, + "loss": 1.2574, + "grad_norm": 0.8287014961242676, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493161984, + "loss": 1.3235, + "grad_norm": 0.7351569533348083, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 1.730297856, + "gpu_mem": 4.4931712, + "loss": 1.2261, + "grad_norm": 1.4165246486663818, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493114368, + "loss": 1.3254, + "grad_norm": 0.8995327949523926, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493114368, + "loss": 1.3268, + "grad_norm": 0.6716095805168152, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493111296, + "loss": 1.2714, + "grad_norm": 0.8761784434318542, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49310976, + "loss": 1.2445, + "grad_norm": 0.9111322164535522, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493152768, + "loss": 1.2178, + "grad_norm": 0.999971330165863, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493091328, + "loss": 1.2843, + "grad_norm": 0.8601518273353577, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49314048, + "loss": 1.2683, + "grad_norm": 0.6579645276069641, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493203456, + "loss": 1.3377, + "grad_norm": 1.3469940423965454, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 1.730297856, + "gpu_mem": 4.49315584, + "loss": 1.2286, + "grad_norm": 0.7698795795440674, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 1.730297856, + "gpu_mem": 4.493137408, + "loss": 1.308, + "grad_norm": 0.899922788143158, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518385664, + "loss": 1.7812, + "grad_norm": 1.63968026638031, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518367232, + "loss": 1.2797, + "grad_norm": 0.974501371383667, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51835648, + "loss": 1.2485, + "grad_norm": 0.874064028263092, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51841024, + "loss": 1.3329, + "grad_norm": 1.917747974395752, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518370304, + "loss": 1.2811, + "grad_norm": 1.0042014122009277, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518388736, + "loss": 1.2512, + "grad_norm": 0.756230354309082, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518451712, + "loss": 1.2568, + "grad_norm": 0.8588067889213562, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51837952, + "loss": 1.3017, + "grad_norm": 1.0848031044006348, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518373376, + "loss": 1.2898, + "grad_norm": 1.1334939002990723, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518388736, + "loss": 1.2394, + "grad_norm": 0.8868346214294434, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518404096, + "loss": 1.226, + "grad_norm": 1.0195926427841187, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51839488, + "loss": 1.2585, + "grad_norm": 1.2687636613845825, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518385664, + "loss": 1.2727, + "grad_norm": 1.2463037967681885, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518404096, + "loss": 1.2482, + "grad_norm": 1.2298178672790527, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51840256, + "loss": 1.2397, + "grad_norm": 1.4216192960739136, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518359552, + "loss": 1.2052, + "grad_norm": 1.4542094469070435, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518391808, + "loss": 1.2339, + "grad_norm": 1.1476835012435913, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518345728, + "loss": 1.2675, + "grad_norm": 0.9283921122550964, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518390272, + "loss": 1.2138, + "grad_norm": 1.9112040996551514, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51834112, + "loss": 1.2253, + "grad_norm": 1.5455485582351685, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518353408, + "loss": 1.2172, + "grad_norm": 0.9593633413314819, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518377984, + "loss": 1.2555, + "grad_norm": 1.2230002880096436, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518339584, + "loss": 1.2324, + "grad_norm": 1.1518198251724243, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518342656, + "loss": 1.2686, + "grad_norm": 1.216966986656189, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518354944, + "loss": 1.1786, + "grad_norm": 1.046160101890564, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518319616, + "loss": 1.2147, + "grad_norm": 1.0946044921875, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518361088, + "loss": 1.1983, + "grad_norm": 1.038320541381836, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518376448, + "loss": 1.218, + "grad_norm": 1.0189380645751953, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 1.730297856, + "gpu_mem": 4.51834112, + "loss": 1.2666, + "grad_norm": 1.4586992263793945, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 1.730297856, + "gpu_mem": 4.5183488, + "loss": 1.2612, + "grad_norm": 1.4176570177078247, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518370304, + "loss": 1.2276, + "grad_norm": 1.7587099075317383, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518381056, + "loss": 1.2869, + "grad_norm": 1.0424158573150635, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518373376, + "loss": 1.3009, + "grad_norm": 1.3875409364700317, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518407168, + "loss": 1.2462, + "grad_norm": 1.1829123497009277, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 1.730297856, + "gpu_mem": 4.518407168, + "train_runtime": 690.3993, + "train_samples_per_second": 13.042, + "train_steps_per_second": 0.203, + "total_flos": 7263074585161728.0, + "train_loss": 1.4339750570910317 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26ebe9ef584396639cb6b281f2c8108d7f3fd14a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..fbdddf0effac60122947a3ead6db643f603776c5 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.7602446483180428 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..76df472b57ee1deb5eae4313ccfaf21634879609 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-boolq-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2", + "seed": 42, + "timestamp": "2025-08-29T13:47:11.610247" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..b5e0e29db18b43dc28d1334d1cf40e52d329b106 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r2-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 0.983474176, + "gpu_mem": 4.407041024, + "loss": 8.869, + "grad_norm": 12.906107902526855, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.42348288, + "gpu_mem": 4.436833792, + "loss": 8.9376, + "grad_norm": 13.228190422058105, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.424269312, + "gpu_mem": 4.436752384, + "loss": 8.925, + "grad_norm": 13.560230255126953, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.424859136, + "gpu_mem": 4.436752384, + "loss": 8.8443, + "grad_norm": 13.839821815490723, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.425645568, + "gpu_mem": 4.436687872, + "loss": 8.6208, + "grad_norm": 14.34335708618164, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.426235392, + "gpu_mem": 4.43670784, + "loss": 8.6661, + "grad_norm": 13.510994911193848, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.427218432, + "gpu_mem": 4.436760064, + "loss": 8.4056, + "grad_norm": 14.874983787536621, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.427808256, + "gpu_mem": 4.43684608, + "loss": 8.0918, + "grad_norm": 15.769499778747559, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.42839808, + "gpu_mem": 4.43675392, + "loss": 7.5402, + "grad_norm": 16.25830078125, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.428791296, + "gpu_mem": 4.43665408, + "loss": 7.1229, + "grad_norm": 16.762798309326172, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.429184512, + "gpu_mem": 4.436758528, + "loss": 6.5236, + "grad_norm": 18.534788131713867, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.429577728, + "gpu_mem": 4.43713024, + "loss": 5.7931, + "grad_norm": 19.805143356323242, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.429970944, + "gpu_mem": 4.436733952, + "loss": 4.9634, + "grad_norm": 19.951181411743164, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.430560768, + "gpu_mem": 4.436710912, + "loss": 3.7073, + "grad_norm": 19.208539962768555, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.430953984, + "gpu_mem": 4.436649472, + "loss": 2.8593, + "grad_norm": 15.399179458618164, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.4313472, + "gpu_mem": 4.436733952, + "loss": 2.1134, + "grad_norm": 11.178773880004883, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.431543808, + "gpu_mem": 4.436773888, + "loss": 1.5229, + "grad_norm": 7.250026702880859, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.432133632, + "gpu_mem": 4.436836864, + "loss": 1.3118, + "grad_norm": 6.159986972808838, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.432526848, + "gpu_mem": 4.436674048, + "loss": 0.9809, + "grad_norm": 3.4541738033294678, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.432723456, + "gpu_mem": 4.436786176, + "loss": 0.7348, + "grad_norm": 2.9101502895355225, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.433116672, + "gpu_mem": 4.436944384, + "loss": 0.6935, + "grad_norm": 1.8647228479385376, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.43331328, + "gpu_mem": 4.436836864, + "loss": 0.7553, + "grad_norm": 3.874441623687744, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.433903104, + "gpu_mem": 4.436809216, + "loss": 0.7322, + "grad_norm": 4.913814544677734, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.43429632, + "gpu_mem": 4.436866048, + "loss": 0.8099, + "grad_norm": 9.272371292114258, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.434492928, + "gpu_mem": 4.436651008, + "loss": 0.6921, + "grad_norm": 2.5613512992858887, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.434689536, + "gpu_mem": 4.436706304, + "loss": 0.8625, + "grad_norm": 11.16734504699707, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.434886144, + "gpu_mem": 4.436998144, + "loss": 0.7036, + "grad_norm": 6.332653045654297, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.43527936, + "gpu_mem": 4.43667712, + "loss": 0.7987, + "grad_norm": 8.638484954833984, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.435475968, + "gpu_mem": 4.436741632, + "loss": 0.6741, + "grad_norm": 4.662780284881592, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.435672576, + "gpu_mem": 4.436819968, + "loss": 0.6965, + "grad_norm": 4.030617713928223, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.435869184, + "gpu_mem": 4.43662336, + "loss": 0.624, + "grad_norm": 3.1740593910217285, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.4362624, + "gpu_mem": 4.436737024, + "loss": 0.734, + "grad_norm": 5.17486572265625, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.436655616, + "gpu_mem": 4.436975104, + "loss": 0.7415, + "grad_norm": 5.361311912536621, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.436852224, + "gpu_mem": 4.43667712, + "loss": 0.5797, + "grad_norm": 0.7124174237251282, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.437048832, + "gpu_mem": 4.436887552, + "loss": 0.6842, + "grad_norm": 2.359614372253418, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.437442048, + "gpu_mem": 4.4368384, + "loss": 0.7523, + "grad_norm": 5.171530723571777, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.437536256, + "gpu_mem": 4.436649472, + "loss": 0.8759, + "grad_norm": 9.087279319763184, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.437929472, + "gpu_mem": 4.436896768, + "loss": 0.7501, + "grad_norm": 3.9313488006591797, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.440288768, + "gpu_mem": 4.43727616, + "loss": 0.6903, + "grad_norm": 2.6422057151794434, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.440681984, + "gpu_mem": 4.43684608, + "loss": 0.659, + "grad_norm": 2.5218183994293213, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.441067008, + "gpu_mem": 4.437073408, + "loss": 0.7519, + "grad_norm": 4.166274070739746, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.441263616, + "gpu_mem": 4.436970496, + "loss": 0.7073, + "grad_norm": 3.669914960861206, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.441460224, + "gpu_mem": 4.43679232, + "loss": 0.6188, + "grad_norm": 0.7932752370834351, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.441656832, + "gpu_mem": 4.436935168, + "loss": 0.6168, + "grad_norm": 1.0058467388153076, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.44185344, + "gpu_mem": 4.43671552, + "loss": 0.7778, + "grad_norm": 4.188762187957764, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.44185344, + "gpu_mem": 4.436958208, + "loss": 0.663, + "grad_norm": 0.8187170624732971, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.442050048, + "gpu_mem": 4.436681728, + "loss": 0.6241, + "grad_norm": 1.1677820682525635, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.442050048, + "gpu_mem": 4.436758528, + "loss": 0.6205, + "grad_norm": 2.267068386077881, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.442246656, + "gpu_mem": 4.436775424, + "loss": 0.7569, + "grad_norm": 3.2423722743988037, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.442443264, + "gpu_mem": 4.436713984, + "loss": 0.6261, + "grad_norm": 0.49267980456352234, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.442443264, + "gpu_mem": 4.436718592, + "loss": 0.6016, + "grad_norm": 1.313482642173767, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.442639872, + "gpu_mem": 4.436798464, + "loss": 0.7021, + "grad_norm": 2.188422203063965, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.442639872, + "gpu_mem": 4.436821504, + "loss": 0.6066, + "grad_norm": 1.4183673858642578, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.44283648, + "gpu_mem": 4.436749312, + "loss": 0.775, + "grad_norm": 3.4905335903167725, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.44283648, + "gpu_mem": 4.437019648, + "loss": 0.6336, + "grad_norm": 0.5713280439376831, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.443033088, + "gpu_mem": 4.436806144, + "loss": 0.664, + "grad_norm": 1.7934110164642334, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.443033088, + "gpu_mem": 4.4368, + "loss": 0.6344, + "grad_norm": 1.9662261009216309, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.443229696, + "gpu_mem": 4.436695552, + "loss": 0.6055, + "grad_norm": 1.6516748666763306, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.443229696, + "gpu_mem": 4.436712448, + "loss": 0.6507, + "grad_norm": 1.5389324426651, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.443426304, + "gpu_mem": 4.436806144, + "loss": 0.5888, + "grad_norm": 1.9982390403747559, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.443426304, + "gpu_mem": 4.436816896, + "loss": 0.5761, + "grad_norm": 0.6515742540359497, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.443622912, + "gpu_mem": 4.436804608, + "loss": 0.7839, + "grad_norm": 4.520028591156006, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.44381952, + "gpu_mem": 4.436796928, + "loss": 0.5023, + "grad_norm": 0.9039902091026306, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.444016128, + "gpu_mem": 4.436726272, + "loss": 0.5933, + "grad_norm": 1.09525728225708, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.444016128, + "gpu_mem": 4.436770816, + "loss": 0.613, + "grad_norm": 0.7913245558738708, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.444016128, + "gpu_mem": 4.436964352, + "loss": 0.6429, + "grad_norm": 4.383787631988525, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.444016128, + "gpu_mem": 4.436674048, + "loss": 0.6787, + "grad_norm": 2.66454815864563, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.444016128, + "gpu_mem": 4.436641792, + "loss": 0.7227, + "grad_norm": 3.381702184677124, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.444016128, + "gpu_mem": 4.43670784, + "loss": 0.6133, + "grad_norm": 2.1771934032440186, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.444212736, + "gpu_mem": 4.436701696, + "loss": 0.5903, + "grad_norm": 4.505643844604492, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.444212736, + "gpu_mem": 4.43693056, + "loss": 0.5607, + "grad_norm": 2.1504905223846436, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.444212736, + "gpu_mem": 4.43692288, + "loss": 0.5541, + "grad_norm": 1.129972219467163, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.444409344, + "gpu_mem": 4.436889088, + "loss": 0.6851, + "grad_norm": 3.7335383892059326, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.444409344, + "gpu_mem": 4.436749312, + "loss": 0.5633, + "grad_norm": 1.4273486137390137, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.444605952, + "gpu_mem": 4.436674048, + "loss": 0.5644, + "grad_norm": 2.211606740951538, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.444605952, + "gpu_mem": 4.436614144, + "loss": 0.6288, + "grad_norm": 1.7605334520339966, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.444605952, + "gpu_mem": 4.436687872, + "loss": 0.5195, + "grad_norm": 2.0635929107666016, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.444605952, + "gpu_mem": 4.436740096, + "loss": 0.8864, + "grad_norm": 23.070409774780273, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.44480256, + "gpu_mem": 4.436872192, + "loss": 0.8843, + "grad_norm": 16.97602081298828, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.44480256, + "gpu_mem": 4.436763136, + "loss": 0.7777, + "grad_norm": 11.48210334777832, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.44480256, + "gpu_mem": 4.436643328, + "loss": 0.8646, + "grad_norm": 9.008251190185547, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.44480256, + "gpu_mem": 4.436712448, + "loss": 0.7789, + "grad_norm": 5.702970504760742, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.44480256, + "gpu_mem": 4.436812288, + "loss": 0.7144, + "grad_norm": 3.7831995487213135, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.44480256, + "gpu_mem": 4.436775424, + "loss": 0.6182, + "grad_norm": 3.961230516433716, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.44480256, + "gpu_mem": 4.43680768, + "loss": 0.6893, + "grad_norm": 6.207698822021484, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.44480256, + "gpu_mem": 4.436758528, + "loss": 0.7014, + "grad_norm": 8.512293815612793, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.446178816, + "gpu_mem": 4.436766208, + "loss": 0.5958, + "grad_norm": 1.9889169931411743, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.448341504, + "gpu_mem": 4.436910592, + "loss": 0.5735, + "grad_norm": 3.8564302921295166, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.450504192, + "gpu_mem": 4.43669248, + "loss": 0.6953, + "grad_norm": 3.774245262145996, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.450504192, + "gpu_mem": 4.43674624, + "loss": 0.6382, + "grad_norm": 3.995943307876587, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.450504192, + "gpu_mem": 4.436713984, + "loss": 0.5595, + "grad_norm": 1.171971321105957, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.4507008, + "gpu_mem": 4.436795392, + "loss": 0.6606, + "grad_norm": 3.8118491172790527, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.451094016, + "gpu_mem": 4.436598784, + "loss": 0.6989, + "grad_norm": 0.950904369354248, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.451094016, + "gpu_mem": 4.436712448, + "loss": 0.6602, + "grad_norm": 3.818436861038208, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.451094016, + "gpu_mem": 4.436732416, + "loss": 0.603, + "grad_norm": 2.9474451541900635, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.451094016, + "gpu_mem": 4.436770816, + "loss": 0.5586, + "grad_norm": 2.3562450408935547, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.451094016, + "gpu_mem": 4.436755456, + "loss": 0.6501, + "grad_norm": 1.0626623630523682, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.451094016, + "gpu_mem": 4.436667904, + "loss": 0.607, + "grad_norm": 1.6151455640792847, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.451094016, + "gpu_mem": 4.436617216, + "loss": 0.6002, + "grad_norm": 2.3275017738342285, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.451487232, + "gpu_mem": 4.436733952, + "loss": 0.5914, + "grad_norm": 0.8867893218994141, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.451487232, + "gpu_mem": 4.436766208, + "loss": 0.6007, + "grad_norm": 0.9918681383132935, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.451487232, + "gpu_mem": 4.4368, + "loss": 0.6235, + "grad_norm": 0.9932146072387695, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.451487232, + "gpu_mem": 4.436850688, + "loss": 0.6079, + "grad_norm": 2.0735538005828857, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.451487232, + "gpu_mem": 4.436755456, + "loss": 0.5517, + "grad_norm": 2.023514986038208, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.451487232, + "gpu_mem": 4.436856832, + "loss": 0.6066, + "grad_norm": 2.5218493938446045, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.451487232, + "gpu_mem": 4.43680768, + "loss": 0.624, + "grad_norm": 2.087386131286621, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.45168384, + "gpu_mem": 4.436695552, + "loss": 0.5281, + "grad_norm": 1.108269214630127, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.45168384, + "gpu_mem": 4.436879872, + "loss": 0.5912, + "grad_norm": 1.503037452697754, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.453453312, + "gpu_mem": 4.436733952, + "loss": 0.6865, + "grad_norm": 3.024348735809326, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.460924416, + "gpu_mem": 4.436737024, + "loss": 0.6289, + "grad_norm": 3.295914888381958, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.46446336, + "gpu_mem": 4.436706304, + "loss": 0.566, + "grad_norm": 2.2233500480651855, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.467805696, + "gpu_mem": 4.436752384, + "loss": 0.5337, + "grad_norm": 1.0901715755462646, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.477439488, + "gpu_mem": 4.436743168, + "loss": 0.6015, + "grad_norm": 1.2340892553329468, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.483927552, + "gpu_mem": 4.436724736, + "loss": 0.5849, + "grad_norm": 2.633082866668701, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.487859712, + "gpu_mem": 4.4368, + "loss": 0.541, + "grad_norm": 1.6253365278244019, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.493757952, + "gpu_mem": 4.436720128, + "loss": 0.5821, + "grad_norm": 1.6362676620483398, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.502998528, + "gpu_mem": 4.436611072, + "loss": 0.5718, + "grad_norm": 1.65804123878479, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.511452672, + "gpu_mem": 4.436849152, + "loss": 0.5678, + "grad_norm": 2.4572842121124268, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.517940736, + "gpu_mem": 4.437019648, + "loss": 0.5117, + "grad_norm": 1.3108607530593872, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.515974656, + "gpu_mem": 4.436752384, + "loss": 0.5412, + "grad_norm": 1.172432541847229, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.516957696, + "gpu_mem": 4.436780032, + "loss": 0.5208, + "grad_norm": 1.1605913639068604, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.517940736, + "gpu_mem": 4.43683072, + "loss": 0.5151, + "grad_norm": 0.9846782684326172, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.521676288, + "gpu_mem": 4.436640256, + "loss": 0.5223, + "grad_norm": 1.9995113611221313, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.524232192, + "gpu_mem": 4.437082624, + "loss": 0.5248, + "grad_norm": 1.3190289735794067, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.532096512, + "gpu_mem": 4.436809216, + "loss": 0.5161, + "grad_norm": 1.4528439044952393, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.53622528, + "gpu_mem": 4.43669248, + "loss": 0.5153, + "grad_norm": 1.2847403287887573, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.54998784, + "gpu_mem": 4.437131776, + "loss": 0.5778, + "grad_norm": 2.2213666439056396, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.560014848, + "gpu_mem": 4.43690752, + "loss": 0.4748, + "grad_norm": 1.3329259157180786, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.563553792, + "gpu_mem": 4.436947456, + "loss": 0.539, + "grad_norm": 2.40218448638916, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.573777408, + "gpu_mem": 4.436729344, + "loss": 0.6459, + "grad_norm": 1.370928406715393, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.583017984, + "gpu_mem": 4.436858368, + "loss": 0.5479, + "grad_norm": 1.7192223072052002, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.587539968, + "gpu_mem": 4.436939776, + "loss": 0.5042, + "grad_norm": 2.9121339321136475, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.592455168, + "gpu_mem": 4.4367232, + "loss": 0.5694, + "grad_norm": 2.8980696201324463, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.596977152, + "gpu_mem": 4.436856832, + "loss": 0.5223, + "grad_norm": 4.267848491668701, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436879872, + "loss": 0.5745, + "grad_norm": 1.5059559345245361, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436717056, + "loss": 0.4481, + "grad_norm": 1.141717553138733, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436597248, + "loss": 0.5839, + "grad_norm": 2.4572339057922363, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436778496, + "loss": 0.5177, + "grad_norm": 2.8804714679718018, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.602482176, + "gpu_mem": 4.43667712, + "loss": 0.5347, + "grad_norm": 2.6375012397766113, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436729344, + "loss": 0.5287, + "grad_norm": 1.7044628858566284, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.602482176, + "gpu_mem": 4.4367616, + "loss": 0.5408, + "grad_norm": 1.2940665483474731, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436901376, + "loss": 0.4845, + "grad_norm": 2.598459005355835, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.602482176, + "gpu_mem": 4.43688448, + "loss": 0.6403, + "grad_norm": 2.0425825119018555, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.602482176, + "gpu_mem": 4.43707648, + "loss": 0.5091, + "grad_norm": 1.6226866245269775, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436787712, + "loss": 0.5384, + "grad_norm": 2.6616642475128174, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.602482176, + "gpu_mem": 4.43682304, + "loss": 0.49, + "grad_norm": 1.0542516708374023, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.602482176, + "gpu_mem": 4.436721664, + "loss": 0.5264, + "grad_norm": 1.8203084468841553, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.602482176, + "gpu_mem": 4.443193856, + "loss": 0.7665, + "grad_norm": 3.8269314765930176, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443129344, + "loss": 0.4594, + "grad_norm": 2.294579267501831, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.602678784, + "gpu_mem": 4.442966528, + "loss": 0.5179, + "grad_norm": 2.6160833835601807, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.602678784, + "gpu_mem": 4.44303872, + "loss": 0.6424, + "grad_norm": 2.3977127075195312, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443074048, + "loss": 0.4599, + "grad_norm": 1.6235495805740356, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443098624, + "loss": 0.6746, + "grad_norm": 2.6140358448028564, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443060224, + "loss": 0.5133, + "grad_norm": 2.4362475872039795, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443281408, + "loss": 0.5488, + "grad_norm": 2.9870870113372803, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443189248, + "loss": 0.5487, + "grad_norm": 4.17355489730835, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443095552, + "loss": 0.5035, + "grad_norm": 1.2917160987854004, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.602678784, + "gpu_mem": 4.443021824, + "loss": 0.4653, + "grad_norm": 1.6404560804367065, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.602875392, + "gpu_mem": 4.443370496, + "loss": 0.4162, + "grad_norm": 1.3462518453598022, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.602875392, + "gpu_mem": 4.442964992, + "loss": 0.4721, + "grad_norm": 1.945906162261963, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.602875392, + "gpu_mem": 4.442911232, + "loss": 0.4662, + "grad_norm": 1.4738388061523438, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.602875392, + "gpu_mem": 4.443686912, + "loss": 0.4641, + "grad_norm": 1.358565092086792, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.602875392, + "gpu_mem": 4.443163136, + "loss": 0.5899, + "grad_norm": 3.4322757720947266, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.602875392, + "gpu_mem": 4.443075584, + "loss": 0.6439, + "grad_norm": 2.8457260131835938, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.60256, + "gpu_mem": 4.443024896, + "loss": 0.5013, + "grad_norm": 2.61537766456604, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.602756608, + "gpu_mem": 4.443120128, + "loss": 0.4382, + "grad_norm": 1.9229656457901, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.602514944, + "gpu_mem": 4.443037184, + "loss": 0.5265, + "grad_norm": 1.5629796981811523, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.602711552, + "gpu_mem": 4.443055616, + "loss": 0.536, + "grad_norm": 2.5809876918792725, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.602711552, + "gpu_mem": 4.443143168, + "loss": 0.5415, + "grad_norm": 4.286618232727051, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.602711552, + "gpu_mem": 4.443027968, + "loss": 0.5834, + "grad_norm": 2.63362979888916, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.602711552, + "gpu_mem": 4.443090944, + "loss": 0.5285, + "grad_norm": 1.5087510347366333, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.602506752, + "gpu_mem": 4.442998784, + "loss": 0.4773, + "grad_norm": 1.4778928756713867, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.602506752, + "gpu_mem": 4.443298304, + "loss": 0.4955, + "grad_norm": 1.2077159881591797, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.60270336, + "gpu_mem": 4.443021824, + "loss": 0.5656, + "grad_norm": 2.176222801208496, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.60270336, + "gpu_mem": 4.442988032, + "loss": 0.563, + "grad_norm": 2.0453193187713623, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.60247808, + "gpu_mem": 4.443126272, + "loss": 0.3879, + "grad_norm": 1.306427001953125, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.602473984, + "gpu_mem": 4.443224576, + "loss": 0.4951, + "grad_norm": 1.602590560913086, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.602670592, + "gpu_mem": 4.442971136, + "loss": 0.4318, + "grad_norm": 1.6733477115631104, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.6024576, + "gpu_mem": 4.443070976, + "loss": 0.5415, + "grad_norm": 2.2424843311309814, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.602654208, + "gpu_mem": 4.443043328, + "loss": 0.4656, + "grad_norm": 2.125783681869507, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.602654208, + "gpu_mem": 4.442980352, + "loss": 0.449, + "grad_norm": 2.178128242492676, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.603006464, + "gpu_mem": 4.443198464, + "loss": 0.5104, + "grad_norm": 3.3784139156341553, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.602809856, + "gpu_mem": 4.443095552, + "loss": 0.5701, + "grad_norm": 2.6157169342041016, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.602809856, + "gpu_mem": 4.443043328, + "loss": 0.4462, + "grad_norm": 2.806597948074341, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.602809856, + "gpu_mem": 4.443021824, + "loss": 0.4471, + "grad_norm": 4.645796775817871, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.602809856, + "gpu_mem": 4.44303104, + "loss": 0.4341, + "grad_norm": 40.47117233276367, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.602809856, + "gpu_mem": 4.442963456, + "loss": 0.6822, + "grad_norm": 42.85932922363281, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.602809856, + "gpu_mem": 4.443126272, + "loss": 0.5436, + "grad_norm": 60.33808898925781, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.602809856, + "gpu_mem": 4.442995712, + "loss": 0.6736, + "grad_norm": 27.595870971679688, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.602605056, + "gpu_mem": 4.44311552, + "loss": 0.4743, + "grad_norm": 13.908833503723145, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.602801664, + "gpu_mem": 4.442934272, + "loss": 0.3733, + "grad_norm": 1.8719438314437866, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.602605056, + "gpu_mem": 4.443066368, + "loss": 0.3733, + "grad_norm": 2.6409752368927, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.602801664, + "gpu_mem": 4.443040256, + "loss": 0.4369, + "grad_norm": 2.8527257442474365, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.602965504, + "gpu_mem": 4.443006464, + "loss": 0.4516, + "grad_norm": 2.0572543144226074, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.602609152, + "gpu_mem": 4.443110912, + "loss": 0.3949, + "grad_norm": 1.900550127029419, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.602797568, + "gpu_mem": 4.443106304, + "loss": 0.4659, + "grad_norm": 1.8844504356384277, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.602994176, + "gpu_mem": 4.442964992, + "loss": 0.4672, + "grad_norm": 3.237405300140381, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.602994176, + "gpu_mem": 4.443156992, + "loss": 0.466, + "grad_norm": 1.9800825119018555, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443008, + "loss": 0.4696, + "grad_norm": 1.6998234987258911, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443110912, + "loss": 0.4928, + "grad_norm": 2.0752861499786377, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443313664, + "loss": 0.3968, + "grad_norm": 2.156602382659912, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.602961408, + "gpu_mem": 4.44311552, + "loss": 0.4826, + "grad_norm": 1.9719423055648804, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443001856, + "loss": 0.4706, + "grad_norm": 2.662337303161621, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.602961408, + "gpu_mem": 4.44301568, + "loss": 0.4529, + "grad_norm": 1.8581463098526, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443060224, + "loss": 0.5464, + "grad_norm": 2.5608296394348145, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443001856, + "loss": 0.4319, + "grad_norm": 1.8282169103622437, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443235328, + "loss": 0.5103, + "grad_norm": 3.3306872844696045, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.602961408, + "gpu_mem": 4.443266048, + "loss": 0.4876, + "grad_norm": 4.448140621185303, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.60284672, + "gpu_mem": 4.44319232, + "loss": 0.5544, + "grad_norm": 1.565893530845642, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.602842624, + "gpu_mem": 4.443080192, + "loss": 0.462, + "grad_norm": 2.3924496173858643, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.603039232, + "gpu_mem": 4.443029504, + "loss": 0.3744, + "grad_norm": 2.423670768737793, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.603039232, + "gpu_mem": 4.442998784, + "loss": 0.4917, + "grad_norm": 2.1420834064483643, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.603039232, + "gpu_mem": 4.443021824, + "loss": 0.6025, + "grad_norm": 2.7889678478240967, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.603039232, + "gpu_mem": 4.443104768, + "loss": 0.3849, + "grad_norm": 2.5013651847839355, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.602781184, + "gpu_mem": 4.443032576, + "loss": 0.5902, + "grad_norm": 4.9631242752075195, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.602977792, + "gpu_mem": 4.443198464, + "loss": 0.4968, + "grad_norm": 2.178252696990967, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.602977792, + "gpu_mem": 4.443040256, + "loss": 0.4605, + "grad_norm": 3.21439790725708, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.602977792, + "gpu_mem": 4.443017216, + "loss": 0.4138, + "grad_norm": 1.9228417873382568, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.602523136, + "gpu_mem": 4.443143168, + "loss": 0.4918, + "grad_norm": 1.752586841583252, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.602719744, + "gpu_mem": 4.44317696, + "loss": 0.5032, + "grad_norm": 1.8907190561294556, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443044864, + "loss": 0.4749, + "grad_norm": 2.090404987335205, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443181568, + "loss": 0.5338, + "grad_norm": 2.359840154647827, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443095552, + "loss": 0.4187, + "grad_norm": 3.311598300933838, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443060224, + "loss": 0.3825, + "grad_norm": 1.841052532196045, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443024896, + "loss": 0.399, + "grad_norm": 1.9204356670379639, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443173888, + "loss": 0.4099, + "grad_norm": 2.367415189743042, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443063296, + "loss": 0.4953, + "grad_norm": 1.629281759262085, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443008, + "loss": 0.528, + "grad_norm": 1.812667727470398, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.602719744, + "gpu_mem": 4.442949632, + "loss": 0.394, + "grad_norm": 1.5554635524749756, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443006464, + "loss": 0.4336, + "grad_norm": 1.8320038318634033, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.602719744, + "gpu_mem": 4.44328448, + "loss": 0.4599, + "grad_norm": 2.155168294906616, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443008, + "loss": 0.5081, + "grad_norm": 1.9731533527374268, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.602719744, + "gpu_mem": 4.443321344, + "loss": 0.416, + "grad_norm": 2.585251569747925, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.602916352, + "gpu_mem": 4.443196928, + "loss": 0.4833, + "grad_norm": 2.581498146057129, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.602916352, + "gpu_mem": 4.442952704, + "loss": 0.3605, + "grad_norm": 1.8107917308807373, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.602916352, + "gpu_mem": 4.443012608, + "loss": 0.538, + "grad_norm": 3.0624032020568848, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.602916352, + "gpu_mem": 4.443074048, + "loss": 0.503, + "grad_norm": 2.4980382919311523, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.60311296, + "gpu_mem": 4.443075584, + "loss": 0.4599, + "grad_norm": 1.9407237768173218, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.60311296, + "gpu_mem": 4.443329024, + "loss": 0.49, + "grad_norm": 2.9388840198516846, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.60311296, + "gpu_mem": 4.442978816, + "loss": 0.5619, + "grad_norm": 1.8997223377227783, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443275264, + "loss": 0.4534, + "grad_norm": 1.9766064882278442, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443137024, + "loss": 0.4608, + "grad_norm": 1.7670924663543701, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.603309568, + "gpu_mem": 4.442989568, + "loss": 0.6521, + "grad_norm": 3.095453977584839, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443129344, + "loss": 0.3603, + "grad_norm": 1.94898521900177, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443008, + "loss": 0.4284, + "grad_norm": 1.6023672819137573, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443101696, + "loss": 0.5007, + "grad_norm": 1.8199307918548584, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443120128, + "loss": 0.3529, + "grad_norm": 1.5310736894607544, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443072512, + "loss": 0.3839, + "grad_norm": 1.739418625831604, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.603309568, + "gpu_mem": 4.442978816, + "loss": 0.3684, + "grad_norm": 2.0681259632110596, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443070976, + "loss": 0.4944, + "grad_norm": 2.333770513534546, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.603309568, + "gpu_mem": 4.442983424, + "loss": 0.3732, + "grad_norm": 1.6777522563934326, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443026432, + "loss": 0.4906, + "grad_norm": 2.97623348236084, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443219968, + "loss": 0.5394, + "grad_norm": 2.6836328506469727, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44301568, + "loss": 0.482, + "grad_norm": 3.104398250579834, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443172352, + "loss": 0.5559, + "grad_norm": 3.304196834564209, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.603309568, + "gpu_mem": 4.442998784, + "loss": 0.418, + "grad_norm": 1.922536015510559, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443411968, + "loss": 0.4713, + "grad_norm": 1.6665114164352417, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443070976, + "loss": 0.4849, + "grad_norm": 3.0644140243530273, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.603309568, + "gpu_mem": 4.442988032, + "loss": 0.4486, + "grad_norm": 2.4036529064178467, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443104768, + "loss": 0.3823, + "grad_norm": 1.550516128540039, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443060224, + "loss": 0.3476, + "grad_norm": 1.6439403295516968, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443017216, + "loss": 0.404, + "grad_norm": 1.9801993370056152, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44305408, + "loss": 0.3795, + "grad_norm": 1.6281121969223022, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443141632, + "loss": 0.3783, + "grad_norm": 2.5569183826446533, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443060224, + "loss": 0.4606, + "grad_norm": 1.8561303615570068, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443275264, + "loss": 0.4994, + "grad_norm": 1.875414252281189, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443067904, + "loss": 0.5116, + "grad_norm": 3.686413526535034, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443072512, + "loss": 0.4256, + "grad_norm": 2.086432456970215, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443083264, + "loss": 0.4929, + "grad_norm": 3.0527637004852295, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443121664, + "loss": 0.4831, + "grad_norm": 2.31764817237854, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443173888, + "loss": 0.4781, + "grad_norm": 1.969340443611145, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443032576, + "loss": 0.495, + "grad_norm": 2.3687548637390137, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.603309568, + "gpu_mem": 4.442912768, + "loss": 0.4434, + "grad_norm": 1.9633749723434448, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443140096, + "loss": 0.451, + "grad_norm": 2.266461133956909, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44338432, + "loss": 0.4588, + "grad_norm": 2.506035804748535, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443044864, + "loss": 0.4768, + "grad_norm": 1.8306457996368408, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.603309568, + "gpu_mem": 4.442991104, + "loss": 0.4983, + "grad_norm": 2.006708860397339, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44315392, + "loss": 0.5843, + "grad_norm": 2.1301772594451904, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443094016, + "loss": 0.4798, + "grad_norm": 2.1593410968780518, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443074048, + "loss": 0.4526, + "grad_norm": 1.7318910360336304, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443009536, + "loss": 0.4792, + "grad_norm": 2.0407559871673584, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44343808, + "loss": 0.4143, + "grad_norm": 2.588322877883911, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443144704, + "loss": 0.415, + "grad_norm": 1.9997503757476807, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44300032, + "loss": 0.4498, + "grad_norm": 1.603151798248291, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44305408, + "loss": 0.4515, + "grad_norm": 1.956412672996521, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443471872, + "loss": 0.3942, + "grad_norm": 2.2995858192443848, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443241472, + "loss": 0.3953, + "grad_norm": 2.05315899848938, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443026432, + "loss": 0.6105, + "grad_norm": 2.8791072368621826, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443110912, + "loss": 0.4459, + "grad_norm": 1.8374425172805786, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443035648, + "loss": 0.5208, + "grad_norm": 2.8564956188201904, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443070976, + "loss": 0.4915, + "grad_norm": 2.185080051422119, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.603309568, + "gpu_mem": 4.44315392, + "loss": 0.4476, + "grad_norm": 1.6571877002716064, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443070976, + "loss": 0.5731, + "grad_norm": 3.137981653213501, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443097088, + "loss": 0.5576, + "grad_norm": 2.974691867828369, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.603309568, + "gpu_mem": 4.443097088, + "train_runtime": 4566.1522, + "train_samples_per_second": 4.129, + "train_steps_per_second": 0.064, + "total_flos": 4.702018789028659e+16, + "train_loss": 0.9013245111825515 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..03eabaea80bc9f8c1936ead28264f565a8ac69c0 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..631182a58e7af7bc90e26f6a03fa08280e34659c --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.8180428134556575 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..019ea8d7ec1514419f3b78a0f83725c8df96457f --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-boolq-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2", + "seed": 42, + "timestamp": "2025-08-30T04:07:14.517363" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..92aef33c4c212709f430421173dd7ec5f45bb763 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r32-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.720283136, + "gpu_mem": 4.518697984, + "loss": 8.869, + "grad_norm": 51.60678482055664, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.726377984, + "gpu_mem": 4.720686592, + "loss": 8.9376, + "grad_norm": 52.917572021484375, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.726967808, + "gpu_mem": 4.720605184, + "loss": 8.4073, + "grad_norm": 53.50897216796875, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.72775424, + "gpu_mem": 4.720605184, + "loss": 7.2589, + "grad_norm": 54.46932601928711, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.728344064, + "gpu_mem": 4.720540672, + "loss": 5.631, + "grad_norm": 49.43437576293945, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.72873728, + "gpu_mem": 4.72056064, + "loss": 3.8802, + "grad_norm": 45.01443099975586, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.729327104, + "gpu_mem": 4.720612864, + "loss": 2.038, + "grad_norm": 24.61026382446289, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.729916928, + "gpu_mem": 4.72069888, + "loss": 1.22, + "grad_norm": 13.973702430725098, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.730506752, + "gpu_mem": 4.72060672, + "loss": 1.0126, + "grad_norm": 7.678637981414795, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.730899968, + "gpu_mem": 4.72050688, + "loss": 0.7563, + "grad_norm": 3.8022642135620117, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.731489792, + "gpu_mem": 4.720611328, + "loss": 0.9799, + "grad_norm": 17.203989028930664, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.731883008, + "gpu_mem": 4.72098304, + "loss": 0.8376, + "grad_norm": 13.539207458496094, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.732276224, + "gpu_mem": 4.720586752, + "loss": 0.8452, + "grad_norm": 13.823596000671387, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.73266944, + "gpu_mem": 4.720563712, + "loss": 0.7739, + "grad_norm": 10.076414108276367, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.732866048, + "gpu_mem": 4.720502272, + "loss": 0.6938, + "grad_norm": 1.1074076890945435, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.733259264, + "gpu_mem": 4.720586752, + "loss": 1.0152, + "grad_norm": 14.208033561706543, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.73365248, + "gpu_mem": 4.720626688, + "loss": 0.7748, + "grad_norm": 7.999721527099609, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.734045696, + "gpu_mem": 4.720689664, + "loss": 0.7783, + "grad_norm": 8.170677185058594, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.734438912, + "gpu_mem": 4.720526848, + "loss": 0.8646, + "grad_norm": 9.882715225219727, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.73463552, + "gpu_mem": 4.720638976, + "loss": 0.7086, + "grad_norm": 5.770381927490234, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.734832128, + "gpu_mem": 4.720797184, + "loss": 0.683, + "grad_norm": 3.6729583740234375, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.735225344, + "gpu_mem": 4.720689664, + "loss": 0.7816, + "grad_norm": 4.814462184906006, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.73561856, + "gpu_mem": 4.720662016, + "loss": 0.6889, + "grad_norm": 2.3137412071228027, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.736011776, + "gpu_mem": 4.720718848, + "loss": 0.6367, + "grad_norm": 2.682117223739624, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.736208384, + "gpu_mem": 4.720503808, + "loss": 0.6728, + "grad_norm": 0.83548504114151, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.736404992, + "gpu_mem": 4.720559104, + "loss": 0.6733, + "grad_norm": 0.6044420599937439, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.736798208, + "gpu_mem": 4.720850944, + "loss": 0.7223, + "grad_norm": 4.928010940551758, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.736994816, + "gpu_mem": 4.72052992, + "loss": 0.6893, + "grad_norm": 1.4699289798736572, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.737191424, + "gpu_mem": 4.720594432, + "loss": 0.6779, + "grad_norm": 3.460477828979492, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.737388032, + "gpu_mem": 4.720672768, + "loss": 0.6714, + "grad_norm": 1.6848291158676147, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.73758464, + "gpu_mem": 4.72047616, + "loss": 0.5914, + "grad_norm": 2.466878652572632, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.737977856, + "gpu_mem": 4.720589824, + "loss": 0.6731, + "grad_norm": 3.012584924697876, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.738174464, + "gpu_mem": 4.720827904, + "loss": 0.6912, + "grad_norm": 1.0402086973190308, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.738371072, + "gpu_mem": 4.72052992, + "loss": 0.5691, + "grad_norm": 0.9186956286430359, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.73856768, + "gpu_mem": 4.720740352, + "loss": 0.9271, + "grad_norm": 7.014492034912109, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.738764288, + "gpu_mem": 4.7206912, + "loss": 0.7777, + "grad_norm": 4.570197105407715, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.738960896, + "gpu_mem": 4.720502272, + "loss": 0.7592, + "grad_norm": 9.505629539489746, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.739157504, + "gpu_mem": 4.720749568, + "loss": 0.7578, + "grad_norm": 4.544586181640625, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.739354112, + "gpu_mem": 4.72112896, + "loss": 0.6557, + "grad_norm": 1.633331537246704, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.73955072, + "gpu_mem": 4.72069888, + "loss": 0.747, + "grad_norm": 3.7240281105041504, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.739747328, + "gpu_mem": 4.720926208, + "loss": 0.7474, + "grad_norm": 6.776904582977295, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.739943936, + "gpu_mem": 4.720823296, + "loss": 9.7761, + "grad_norm": 94.46591186523438, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.740140544, + "gpu_mem": 4.72064512, + "loss": 0.5947, + "grad_norm": 1.100540280342102, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.740337152, + "gpu_mem": 4.720787968, + "loss": 0.7026, + "grad_norm": 3.0734498500823975, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.74053376, + "gpu_mem": 4.72056832, + "loss": 0.9385, + "grad_norm": 5.014179706573486, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.74053376, + "gpu_mem": 4.720811008, + "loss": 0.6442, + "grad_norm": 2.618241548538208, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.74053376, + "gpu_mem": 4.720534528, + "loss": 0.6731, + "grad_norm": 3.2640492916107178, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.740730368, + "gpu_mem": 4.720611328, + "loss": 0.5926, + "grad_norm": 2.29248046875, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.740926976, + "gpu_mem": 4.720628224, + "loss": 0.8899, + "grad_norm": 5.544245719909668, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.740926976, + "gpu_mem": 4.720566784, + "loss": 0.6715, + "grad_norm": 2.797156572341919, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.741123584, + "gpu_mem": 4.720571392, + "loss": 0.5647, + "grad_norm": 2.037566900253296, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.741320192, + "gpu_mem": 4.720651264, + "loss": 0.6462, + "grad_norm": 2.068006992340088, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.741320192, + "gpu_mem": 4.720674304, + "loss": 0.6825, + "grad_norm": 5.855047225952148, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.7415168, + "gpu_mem": 4.720602112, + "loss": 0.633, + "grad_norm": 1.4529956579208374, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.7415168, + "gpu_mem": 4.720872448, + "loss": 0.611, + "grad_norm": 2.3096799850463867, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.741713408, + "gpu_mem": 4.720658944, + "loss": 0.7088, + "grad_norm": 2.794340133666992, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.741713408, + "gpu_mem": 4.7206528, + "loss": 0.5946, + "grad_norm": 1.2550841569900513, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.741713408, + "gpu_mem": 4.720548352, + "loss": 0.5883, + "grad_norm": 0.9456785321235657, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.741910016, + "gpu_mem": 4.720565248, + "loss": 0.5858, + "grad_norm": 0.9793041944503784, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.741910016, + "gpu_mem": 4.720658944, + "loss": 0.5358, + "grad_norm": 1.3087780475616455, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.742106624, + "gpu_mem": 4.720669696, + "loss": 0.5731, + "grad_norm": 2.030730962753296, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.742106624, + "gpu_mem": 4.720657408, + "loss": 0.8549, + "grad_norm": 4.453673839569092, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.742303232, + "gpu_mem": 4.720649728, + "loss": 0.4514, + "grad_norm": 3.212015390396118, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.742303232, + "gpu_mem": 4.720579072, + "loss": 0.5271, + "grad_norm": 3.536609411239624, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.742303232, + "gpu_mem": 4.720623616, + "loss": 0.5957, + "grad_norm": 1.7865204811096191, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.74249984, + "gpu_mem": 4.720817152, + "loss": 0.5146, + "grad_norm": 1.4939695596694946, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.74249984, + "gpu_mem": 4.720526848, + "loss": 0.5516, + "grad_norm": 2.3452401161193848, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.74249984, + "gpu_mem": 4.720494592, + "loss": 0.6503, + "grad_norm": 2.647474765777588, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.742696448, + "gpu_mem": 4.72056064, + "loss": 0.6561, + "grad_norm": 4.234159469604492, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.742696448, + "gpu_mem": 4.720554496, + "loss": 0.5756, + "grad_norm": 1.8981281518936157, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.742893056, + "gpu_mem": 4.72078336, + "loss": 0.5445, + "grad_norm": 2.1030561923980713, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.742893056, + "gpu_mem": 4.72077568, + "loss": 0.5823, + "grad_norm": 3.3516831398010254, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.742893056, + "gpu_mem": 4.720741888, + "loss": 0.7, + "grad_norm": 3.1816067695617676, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.742893056, + "gpu_mem": 4.720602112, + "loss": 0.6819, + "grad_norm": 6.909383773803711, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.742893056, + "gpu_mem": 4.720526848, + "loss": 0.8123, + "grad_norm": 7.361441135406494, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.742893056, + "gpu_mem": 4.720466944, + "loss": 0.5961, + "grad_norm": 3.6832051277160645, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.743089664, + "gpu_mem": 4.720540672, + "loss": 0.5205, + "grad_norm": 1.8370615243911743, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.743089664, + "gpu_mem": 4.720592896, + "loss": 0.9777, + "grad_norm": 6.232685565948486, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.743089664, + "gpu_mem": 4.720724992, + "loss": 0.6923, + "grad_norm": 3.0233302116394043, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.743089664, + "gpu_mem": 4.720615936, + "loss": 0.5651, + "grad_norm": 1.1419326066970825, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.743089664, + "gpu_mem": 4.720496128, + "loss": 0.5737, + "grad_norm": 1.6287283897399902, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.743089664, + "gpu_mem": 4.720565248, + "loss": 0.5534, + "grad_norm": 1.266178011894226, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.743286272, + "gpu_mem": 4.720665088, + "loss": 0.626, + "grad_norm": 1.8044404983520508, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.743286272, + "gpu_mem": 4.720628224, + "loss": 0.5178, + "grad_norm": 0.8960481286048889, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.74348288, + "gpu_mem": 4.72066048, + "loss": 0.5488, + "grad_norm": 1.3372315168380737, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.74348288, + "gpu_mem": 4.720611328, + "loss": 0.5562, + "grad_norm": 1.2149802446365356, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.74348288, + "gpu_mem": 4.720619008, + "loss": 0.4655, + "grad_norm": 1.4968947172164917, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.74348288, + "gpu_mem": 4.720763392, + "loss": 0.5504, + "grad_norm": 2.285689115524292, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.74348288, + "gpu_mem": 4.72054528, + "loss": 0.4889, + "grad_norm": 2.0822064876556396, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.74348288, + "gpu_mem": 4.72059904, + "loss": 0.4943, + "grad_norm": 3.9266443252563477, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.74348288, + "gpu_mem": 4.720566784, + "loss": 0.4021, + "grad_norm": 2.7494914531707764, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.74348288, + "gpu_mem": 4.720648192, + "loss": 0.6306, + "grad_norm": 4.651350975036621, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720451584, + "loss": 0.8099, + "grad_norm": 3.718210458755493, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720565248, + "loss": 0.7, + "grad_norm": 6.5295233726501465, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720585216, + "loss": 0.5295, + "grad_norm": 1.7366926670074463, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720623616, + "loss": 0.5149, + "grad_norm": 2.2745392322540283, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720608256, + "loss": 0.6512, + "grad_norm": 3.646397352218628, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720520704, + "loss": 0.5422, + "grad_norm": 1.9580708742141724, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720470016, + "loss": 0.5669, + "grad_norm": 2.4615302085876465, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720586752, + "loss": 0.6187, + "grad_norm": 4.497075080871582, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720619008, + "loss": 0.5869, + "grad_norm": 2.289053440093994, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.743679488, + "gpu_mem": 4.7206528, + "loss": 0.5424, + "grad_norm": 2.228019952774048, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720703488, + "loss": 0.6973, + "grad_norm": 3.192368507385254, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720608256, + "loss": 0.5425, + "grad_norm": 2.1224424839019775, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720709632, + "loss": 0.5521, + "grad_norm": 1.678141713142395, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.743679488, + "gpu_mem": 4.72066048, + "loss": 0.5523, + "grad_norm": 2.573184013366699, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720548352, + "loss": 0.5066, + "grad_norm": 2.299196243286133, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720732672, + "loss": 0.5026, + "grad_norm": 1.351962685585022, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720586752, + "loss": 0.6434, + "grad_norm": 1.961143136024475, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720589824, + "loss": 0.5521, + "grad_norm": 2.112461566925049, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.743679488, + "gpu_mem": 4.720559104, + "loss": 0.4724, + "grad_norm": 1.2213021516799927, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720605184, + "loss": 0.5382, + "grad_norm": 2.333643913269043, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720595968, + "loss": 0.4957, + "grad_norm": 2.2547905445098877, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720577536, + "loss": 0.429, + "grad_norm": 1.5382694005966187, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.743876096, + "gpu_mem": 4.7206528, + "loss": 0.508, + "grad_norm": 2.0915169715881348, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720572928, + "loss": 0.4888, + "grad_norm": 1.490182876586914, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720463872, + "loss": 0.5164, + "grad_norm": 2.824855089187622, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720701952, + "loss": 0.5919, + "grad_norm": 3.377058744430542, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720872448, + "loss": 0.5176, + "grad_norm": 2.5313000679016113, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720605184, + "loss": 0.5535, + "grad_norm": 3.8832449913024902, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.743876096, + "gpu_mem": 4.720632832, + "loss": 0.5763, + "grad_norm": 2.72977352142334, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.744072704, + "gpu_mem": 4.72068352, + "loss": 0.3668, + "grad_norm": 1.5825119018554688, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720493056, + "loss": 0.4973, + "grad_norm": 2.62083101272583, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720935424, + "loss": 0.4926, + "grad_norm": 2.6053855419158936, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720662016, + "loss": 0.474, + "grad_norm": 1.6001797914505005, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.744072704, + "gpu_mem": 4.72054528, + "loss": 0.5057, + "grad_norm": 1.8201334476470947, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720984576, + "loss": 0.5122, + "grad_norm": 1.6502737998962402, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.744072704, + "gpu_mem": 4.72076032, + "loss": 0.4455, + "grad_norm": 1.8586137294769287, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720800256, + "loss": 0.442, + "grad_norm": 1.3689652681350708, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720582144, + "loss": 0.5437, + "grad_norm": 1.3689312934875488, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720711168, + "loss": 0.4548, + "grad_norm": 1.7695180177688599, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720792576, + "loss": 0.3353, + "grad_norm": 1.1275129318237305, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720576, + "loss": 0.4001, + "grad_norm": 2.070481061935425, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720709632, + "loss": 0.3911, + "grad_norm": 2.228978395462036, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720732672, + "loss": 0.4716, + "grad_norm": 2.3769936561584473, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720569856, + "loss": 0.4062, + "grad_norm": 2.331482410430908, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720450048, + "loss": 0.3982, + "grad_norm": 2.4171366691589355, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720631296, + "loss": 0.4115, + "grad_norm": 2.983798027038574, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.744072704, + "gpu_mem": 4.72052992, + "loss": 0.4146, + "grad_norm": 3.065737724304199, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720582144, + "loss": 0.5438, + "grad_norm": 3.4206557273864746, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.744072704, + "gpu_mem": 4.7206144, + "loss": 0.6969, + "grad_norm": 4.57802677154541, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720754176, + "loss": 0.4681, + "grad_norm": 3.5430493354797363, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.744072704, + "gpu_mem": 4.72073728, + "loss": 0.5519, + "grad_norm": 2.895052671432495, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.744072704, + "gpu_mem": 4.72092928, + "loss": 0.4757, + "grad_norm": 2.613802909851074, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720640512, + "loss": 0.4792, + "grad_norm": 2.0761008262634277, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.744072704, + "gpu_mem": 4.72067584, + "loss": 0.5492, + "grad_norm": 2.1266207695007324, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.744072704, + "gpu_mem": 4.720574464, + "loss": 0.5, + "grad_norm": 2.12516450881958, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821664256, + "loss": 0.5425, + "grad_norm": 2.1539177894592285, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821599744, + "loss": 0.3761, + "grad_norm": 2.2254269123077393, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821436928, + "loss": 0.4045, + "grad_norm": 2.78285813331604, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82150912, + "loss": 0.3761, + "grad_norm": 1.6437393426895142, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821544448, + "loss": 0.3874, + "grad_norm": 2.0093090534210205, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821569024, + "loss": 0.5161, + "grad_norm": 2.3020403385162354, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821530624, + "loss": 0.3661, + "grad_norm": 1.5422757863998413, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821751808, + "loss": 0.4111, + "grad_norm": 2.014267921447754, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821659648, + "loss": 0.291, + "grad_norm": 1.978320837020874, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821565952, + "loss": 0.3638, + "grad_norm": 1.8370791673660278, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821492224, + "loss": 0.3595, + "grad_norm": 2.0211241245269775, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821840896, + "loss": 0.2689, + "grad_norm": 1.456434965133667, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821435392, + "loss": 0.3214, + "grad_norm": 1.8869231939315796, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821381632, + "loss": 0.3413, + "grad_norm": 1.742978811264038, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.744072704, + "gpu_mem": 4.822157312, + "loss": 0.3185, + "grad_norm": 1.9208202362060547, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821633536, + "loss": 0.3309, + "grad_norm": 2.343064785003662, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821545984, + "loss": 0.3542, + "grad_norm": 2.68346905708313, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821495296, + "loss": 0.2649, + "grad_norm": 2.21683931350708, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821590528, + "loss": 0.3104, + "grad_norm": 2.4603419303894043, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821507584, + "loss": 0.431, + "grad_norm": 3.0024454593658447, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821526016, + "loss": 0.3301, + "grad_norm": 2.411245822906494, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821613568, + "loss": 0.2755, + "grad_norm": 2.378558397293091, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821498368, + "loss": 0.4929, + "grad_norm": 3.361145496368408, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821561344, + "loss": 0.5161, + "grad_norm": 3.6370110511779785, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821469184, + "loss": 0.248, + "grad_norm": 3.121709108352661, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821768704, + "loss": 0.5628, + "grad_norm": 3.892179012298584, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821492224, + "loss": 0.4715, + "grad_norm": 3.0351951122283936, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821458432, + "loss": 0.3665, + "grad_norm": 2.5450587272644043, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821596672, + "loss": 0.4406, + "grad_norm": 3.6099371910095215, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821694976, + "loss": 0.4205, + "grad_norm": 2.6356284618377686, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821441536, + "loss": 0.382, + "grad_norm": 2.3380846977233887, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821541376, + "loss": 0.4271, + "grad_norm": 2.20468807220459, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821513728, + "loss": 0.2822, + "grad_norm": 1.981376051902771, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821450752, + "loss": 0.244, + "grad_norm": 1.5451654195785522, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821668864, + "loss": 0.4274, + "grad_norm": 3.1464285850524902, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821565952, + "loss": 0.3007, + "grad_norm": 2.160327196121216, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821513728, + "loss": 0.3186, + "grad_norm": 2.1222620010375977, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821492224, + "loss": 0.3547, + "grad_norm": 2.4442389011383057, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82150144, + "loss": 0.2353, + "grad_norm": 1.8244935274124146, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821433856, + "loss": 0.3667, + "grad_norm": 2.1027145385742188, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821596672, + "loss": 0.2041, + "grad_norm": 1.9952877759933472, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821466112, + "loss": 0.3585, + "grad_norm": 2.5639634132385254, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82158592, + "loss": 0.3623, + "grad_norm": 2.353642463684082, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821404672, + "loss": 0.2274, + "grad_norm": 2.21895432472229, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821536768, + "loss": 0.2585, + "grad_norm": 3.5670340061187744, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821510656, + "loss": 0.3029, + "grad_norm": 3.3269450664520264, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821476864, + "loss": 0.3436, + "grad_norm": 3.1074931621551514, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821581312, + "loss": 0.2544, + "grad_norm": 2.4987263679504395, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821576704, + "loss": 0.4032, + "grad_norm": 2.5920281410217285, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821435392, + "loss": 0.373, + "grad_norm": 3.142155647277832, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821627392, + "loss": 0.3079, + "grad_norm": 2.6061782836914062, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.744072704, + "gpu_mem": 4.8214784, + "loss": 0.2853, + "grad_norm": 2.673964023590088, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821581312, + "loss": 0.3101, + "grad_norm": 2.496502637863159, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821784064, + "loss": 0.2441, + "grad_norm": 1.9021148681640625, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82158592, + "loss": 0.2353, + "grad_norm": 2.1781320571899414, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821472256, + "loss": 0.2864, + "grad_norm": 2.5567240715026855, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82148608, + "loss": 0.2289, + "grad_norm": 1.897853970527649, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821530624, + "loss": 0.2603, + "grad_norm": 1.566983938217163, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821472256, + "loss": 0.2385, + "grad_norm": 1.7799900770187378, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821705728, + "loss": 0.4147, + "grad_norm": 2.881108045578003, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821736448, + "loss": 0.2487, + "grad_norm": 2.413271427154541, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82166272, + "loss": 0.2358, + "grad_norm": 1.7895900011062622, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821550592, + "loss": 0.3909, + "grad_norm": 2.3234219551086426, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821499904, + "loss": 0.2744, + "grad_norm": 2.0628280639648438, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821469184, + "loss": 0.1836, + "grad_norm": 1.3194963932037354, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821492224, + "loss": 0.2264, + "grad_norm": 2.6920981407165527, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821575168, + "loss": 0.3359, + "grad_norm": 2.812617778778076, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821502976, + "loss": 0.3195, + "grad_norm": 3.324860095977783, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821668864, + "loss": 0.2554, + "grad_norm": 1.7928407192230225, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821510656, + "loss": 0.335, + "grad_norm": 2.9798684120178223, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821487616, + "loss": 0.3263, + "grad_norm": 2.465559482574463, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821613568, + "loss": 0.3686, + "grad_norm": 2.267286539077759, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82164736, + "loss": 0.2345, + "grad_norm": 2.027508020401001, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821515264, + "loss": 0.2911, + "grad_norm": 2.020876884460449, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821651968, + "loss": 0.3821, + "grad_norm": 2.8136119842529297, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821565952, + "loss": 0.3757, + "grad_norm": 3.556309461593628, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821530624, + "loss": 0.1841, + "grad_norm": 1.9165980815887451, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821495296, + "loss": 0.1797, + "grad_norm": 1.8034125566482544, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821644288, + "loss": 0.3248, + "grad_norm": 2.2229056358337402, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821533696, + "loss": 0.2832, + "grad_norm": 2.2350094318389893, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.744072704, + "gpu_mem": 4.8214784, + "loss": 0.2927, + "grad_norm": 2.6874685287475586, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821420032, + "loss": 0.2513, + "grad_norm": 2.4346120357513428, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821476864, + "loss": 0.2095, + "grad_norm": 2.316563844680786, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82175488, + "loss": 0.3342, + "grad_norm": 2.81589412689209, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.744072704, + "gpu_mem": 4.8214784, + "loss": 0.3377, + "grad_norm": 2.403080701828003, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821791744, + "loss": 0.192, + "grad_norm": 2.292146921157837, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821667328, + "loss": 0.2057, + "grad_norm": 2.3101119995117188, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821423104, + "loss": 0.2743, + "grad_norm": 2.0743401050567627, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821483008, + "loss": 0.2656, + "grad_norm": 1.919785499572754, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821544448, + "loss": 0.3226, + "grad_norm": 2.723062753677368, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821545984, + "loss": 0.2555, + "grad_norm": 2.2839860916137695, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821799424, + "loss": 0.329, + "grad_norm": 2.159705877304077, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821449216, + "loss": 0.5048, + "grad_norm": 4.062475681304932, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821745664, + "loss": 0.3341, + "grad_norm": 2.4401588439941406, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821607424, + "loss": 0.2024, + "grad_norm": 1.535955548286438, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821459968, + "loss": 0.2519, + "grad_norm": 2.1156792640686035, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821599744, + "loss": 0.215, + "grad_norm": 2.2101540565490723, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.744072704, + "gpu_mem": 4.8214784, + "loss": 0.2625, + "grad_norm": 2.2044804096221924, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821572096, + "loss": 0.375, + "grad_norm": 2.3791656494140625, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821590528, + "loss": 0.1599, + "grad_norm": 1.629304051399231, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821542912, + "loss": 0.271, + "grad_norm": 1.8550136089324951, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821449216, + "loss": 0.1954, + "grad_norm": 1.2687727212905884, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821541376, + "loss": 0.3222, + "grad_norm": 2.447190999984741, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821453824, + "loss": 0.2174, + "grad_norm": 1.9837526082992554, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821496832, + "loss": 0.2836, + "grad_norm": 2.2523815631866455, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821690368, + "loss": 0.2447, + "grad_norm": 2.9001996517181396, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82148608, + "loss": 0.3955, + "grad_norm": 2.917618751525879, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821642752, + "loss": 0.3537, + "grad_norm": 2.998972177505493, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821469184, + "loss": 0.1695, + "grad_norm": 2.372494697570801, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821882368, + "loss": 0.3246, + "grad_norm": 1.8160532712936401, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821541376, + "loss": 0.2896, + "grad_norm": 2.9768693447113037, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821458432, + "loss": 0.345, + "grad_norm": 2.4858179092407227, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821575168, + "loss": 0.1678, + "grad_norm": 1.6486356258392334, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821530624, + "loss": 0.1327, + "grad_norm": 1.282950758934021, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.744072704, + "gpu_mem": 4.821487616, + "loss": 0.2609, + "grad_norm": 2.008997917175293, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.744072704, + "gpu_mem": 4.82152448, + "loss": 0.2569, + "grad_norm": 1.967729926109314, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.745842176, + "gpu_mem": 4.821612032, + "loss": 0.1652, + "grad_norm": 1.956727385520935, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821530624, + "loss": 0.3736, + "grad_norm": 2.719573736190796, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821745664, + "loss": 0.3917, + "grad_norm": 3.0269434452056885, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821538304, + "loss": 0.3528, + "grad_norm": 3.023977518081665, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821542912, + "loss": 0.2244, + "grad_norm": 2.8531906604766846, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821553664, + "loss": 0.4291, + "grad_norm": 3.1002683639526367, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821592064, + "loss": 0.2661, + "grad_norm": 2.776768445968628, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821644288, + "loss": 0.2808, + "grad_norm": 2.421647310256958, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821502976, + "loss": 0.2695, + "grad_norm": 1.7574200630187988, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821383168, + "loss": 0.324, + "grad_norm": 2.5117247104644775, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821610496, + "loss": 0.2459, + "grad_norm": 2.1567142009735107, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.747808256, + "gpu_mem": 4.82185472, + "loss": 0.2354, + "grad_norm": 2.462887763977051, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821515264, + "loss": 0.2315, + "grad_norm": 2.0217902660369873, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821461504, + "loss": 0.3156, + "grad_norm": 2.2479891777038574, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.747808256, + "gpu_mem": 4.82162432, + "loss": 0.3981, + "grad_norm": 2.9107162952423096, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821564416, + "loss": 0.3483, + "grad_norm": 1.961869478225708, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821544448, + "loss": 0.2644, + "grad_norm": 2.467308521270752, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821479936, + "loss": 0.3864, + "grad_norm": 2.2558891773223877, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.747808256, + "gpu_mem": 4.82190848, + "loss": 0.1935, + "grad_norm": 1.8974677324295044, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821615104, + "loss": 0.3479, + "grad_norm": 3.1258468627929688, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.747808256, + "gpu_mem": 4.82147072, + "loss": 0.3342, + "grad_norm": 2.2150111198425293, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.747808256, + "gpu_mem": 4.82152448, + "loss": 0.3128, + "grad_norm": 1.9711337089538574, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821942272, + "loss": 0.1826, + "grad_norm": 2.0316872596740723, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821711872, + "loss": 0.277, + "grad_norm": 2.4691121578216553, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821496832, + "loss": 0.421, + "grad_norm": 1.8380540609359741, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821581312, + "loss": 0.1388, + "grad_norm": 1.5486232042312622, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821506048, + "loss": 0.2864, + "grad_norm": 3.1102795600891113, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821541376, + "loss": 0.2744, + "grad_norm": 1.8687769174575806, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.747808256, + "gpu_mem": 4.82162432, + "loss": 0.2912, + "grad_norm": 2.2209088802337646, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821541376, + "loss": 0.4437, + "grad_norm": 3.4202144145965576, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821567488, + "loss": 0.2208, + "grad_norm": 1.8357125520706177, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.747808256, + "gpu_mem": 4.821567488, + "train_runtime": 4579.6596, + "train_samples_per_second": 4.117, + "train_steps_per_second": 0.064, + "total_flos": 4.809368057590579e+16, + "train_loss": 0.6318229741486562 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..616e0cc3677d4646846654f1887fbef4d57d10ca --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..46f4adcdfe44083310b0b55b832d78b0837b8e4c --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.8 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..a39312750896ffa7d456ca2d9e3fa216be158541 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-boolq-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2", + "seed": 42, + "timestamp": "2025-08-29T20:59:39.191229" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..2a8785a2e42967ac3989d7aae1760392de2b6174 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-boolq-r8-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 1.688006656, + "gpu_mem": 4.443003904, + "loss": 8.869, + "grad_norm": 25.80951690673828, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 1.693904896, + "gpu_mem": 4.493604352, + "loss": 8.9376, + "grad_norm": 26.43102264404297, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 1.694691328, + "gpu_mem": 4.493522944, + "loss": 8.8176, + "grad_norm": 26.73906898498535, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 1.69547776, + "gpu_mem": 4.493522944, + "loss": 8.5096, + "grad_norm": 27.39969825744629, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 1.695870976, + "gpu_mem": 4.493458432, + "loss": 7.9557, + "grad_norm": 27.356904983520508, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 1.6964608, + "gpu_mem": 4.4934784, + "loss": 7.5594, + "grad_norm": 26.265954971313477, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 1.697050624, + "gpu_mem": 4.493530624, + "loss": 6.5821, + "grad_norm": 28.45710563659668, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 1.697640448, + "gpu_mem": 4.49361664, + "loss": 5.4716, + "grad_norm": 28.2744083404541, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 1.698033664, + "gpu_mem": 4.49352448, + "loss": 4.174, + "grad_norm": 25.48270606994629, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 1.698623488, + "gpu_mem": 4.49342464, + "loss": 2.9274, + "grad_norm": 21.181928634643555, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 1.699016704, + "gpu_mem": 4.493529088, + "loss": 1.7605, + "grad_norm": 11.911050796508789, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 1.69940992, + "gpu_mem": 4.4939008, + "loss": 1.2653, + "grad_norm": 7.890399932861328, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 1.699803136, + "gpu_mem": 4.493504512, + "loss": 1.0507, + "grad_norm": 6.433838844299316, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 1.700196352, + "gpu_mem": 4.493481472, + "loss": 0.7746, + "grad_norm": 2.6924006938934326, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 1.700589568, + "gpu_mem": 4.493420032, + "loss": 0.8051, + "grad_norm": 4.62258768081665, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 1.700982784, + "gpu_mem": 4.493504512, + "loss": 0.7593, + "grad_norm": 6.643669128417969, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 1.701376, + "gpu_mem": 4.493544448, + "loss": 0.8072, + "grad_norm": 7.782557487487793, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 1.701769216, + "gpu_mem": 4.493607424, + "loss": 0.678, + "grad_norm": 3.5699033737182617, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 1.702162432, + "gpu_mem": 4.493444608, + "loss": 1.1614, + "grad_norm": 16.549509048461914, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 1.70235904, + "gpu_mem": 4.493556736, + "loss": 0.8437, + "grad_norm": 9.722262382507324, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 1.702555648, + "gpu_mem": 4.493714944, + "loss": 0.7431, + "grad_norm": 7.257400035858154, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 1.702948864, + "gpu_mem": 4.493607424, + "loss": 0.6845, + "grad_norm": 1.694704294204712, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 1.70334208, + "gpu_mem": 4.493579776, + "loss": 1.0115, + "grad_norm": 11.411857604980469, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 1.703538688, + "gpu_mem": 4.493636608, + "loss": 1.2285, + "grad_norm": 14.558090209960938, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 1.703735296, + "gpu_mem": 4.493421568, + "loss": 0.9003, + "grad_norm": 8.614864349365234, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 1.704128512, + "gpu_mem": 4.493476864, + "loss": 0.6847, + "grad_norm": 0.6057906746864319, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 1.704521728, + "gpu_mem": 4.493768704, + "loss": 0.6172, + "grad_norm": 1.1809642314910889, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 1.704718336, + "gpu_mem": 4.49344768, + "loss": 0.7674, + "grad_norm": 4.456021308898926, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 1.704914944, + "gpu_mem": 4.493512192, + "loss": 0.6593, + "grad_norm": 1.3652160167694092, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 1.705111552, + "gpu_mem": 4.493590528, + "loss": 0.6684, + "grad_norm": 0.8698585033416748, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 1.70530816, + "gpu_mem": 4.49339392, + "loss": 0.6113, + "grad_norm": 1.9089570045471191, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 1.705504768, + "gpu_mem": 4.493507584, + "loss": 0.6863, + "grad_norm": 1.697677731513977, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 1.705701376, + "gpu_mem": 4.493745664, + "loss": 0.6867, + "grad_norm": 0.9887014627456665, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 1.706094592, + "gpu_mem": 4.49344768, + "loss": 0.6611, + "grad_norm": 3.722730875015259, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 1.7062912, + "gpu_mem": 4.493658112, + "loss": 0.6814, + "grad_norm": 1.1764622926712036, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 1.706487808, + "gpu_mem": 4.49360896, + "loss": 0.6677, + "grad_norm": 1.456813097000122, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 1.706684416, + "gpu_mem": 4.493420032, + "loss": 0.6259, + "grad_norm": 1.3963892459869385, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 1.706881024, + "gpu_mem": 4.493667328, + "loss": 0.8198, + "grad_norm": 4.987330913543701, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 1.707077632, + "gpu_mem": 4.49404672, + "loss": 0.7812, + "grad_norm": 3.9238972663879395, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 1.70727424, + "gpu_mem": 4.49361664, + "loss": 0.6559, + "grad_norm": 2.3135671615600586, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 1.707470848, + "gpu_mem": 4.493843968, + "loss": 0.6894, + "grad_norm": 1.7226029634475708, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 1.707667456, + "gpu_mem": 4.493741056, + "loss": 0.6965, + "grad_norm": 3.363767147064209, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 1.707864064, + "gpu_mem": 4.49356288, + "loss": 0.6643, + "grad_norm": 3.8437230587005615, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 1.707864064, + "gpu_mem": 4.493705728, + "loss": 0.6054, + "grad_norm": 1.1111056804656982, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 1.708060672, + "gpu_mem": 4.49348608, + "loss": 0.9782, + "grad_norm": 5.8057379722595215, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 1.70825728, + "gpu_mem": 4.493728768, + "loss": 0.7976, + "grad_norm": 4.049243450164795, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 1.708453888, + "gpu_mem": 4.493452288, + "loss": 0.7163, + "grad_norm": 2.926071882247925, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 1.708453888, + "gpu_mem": 4.493529088, + "loss": 0.5791, + "grad_norm": 0.5326256155967712, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 1.708650496, + "gpu_mem": 4.493545984, + "loss": 0.6759, + "grad_norm": 1.1503798961639404, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 1.708847104, + "gpu_mem": 4.493484544, + "loss": 0.7367, + "grad_norm": 3.2880921363830566, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 1.708847104, + "gpu_mem": 4.493489152, + "loss": 0.8794, + "grad_norm": 5.401247978210449, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 1.709043712, + "gpu_mem": 4.493569024, + "loss": 0.6769, + "grad_norm": 2.1474921703338623, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 1.709043712, + "gpu_mem": 4.493592064, + "loss": 0.6135, + "grad_norm": 2.1449711322784424, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 1.70924032, + "gpu_mem": 4.493519872, + "loss": 0.8567, + "grad_norm": 4.953275680541992, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 1.70924032, + "gpu_mem": 4.493790208, + "loss": 0.6669, + "grad_norm": 3.1533241271972656, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 1.70924032, + "gpu_mem": 4.493576704, + "loss": 0.7462, + "grad_norm": 3.9032444953918457, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 1.709436928, + "gpu_mem": 4.49357056, + "loss": 0.5896, + "grad_norm": 1.123113989830017, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 1.709436928, + "gpu_mem": 4.493466112, + "loss": 0.5635, + "grad_norm": 1.4478036165237427, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 1.709633536, + "gpu_mem": 4.493483008, + "loss": 0.6124, + "grad_norm": 2.3705861568450928, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 1.709633536, + "gpu_mem": 4.493576704, + "loss": 0.5377, + "grad_norm": 1.2596737146377563, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 1.709633536, + "gpu_mem": 4.493587456, + "loss": 0.5592, + "grad_norm": 1.3407964706420898, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 1.709830144, + "gpu_mem": 4.493575168, + "loss": 0.7942, + "grad_norm": 4.734757423400879, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 1.710026752, + "gpu_mem": 4.493567488, + "loss": 0.457, + "grad_norm": 0.8973060846328735, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 1.710026752, + "gpu_mem": 4.493496832, + "loss": 0.5389, + "grad_norm": 1.2568331956863403, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 1.71022336, + "gpu_mem": 4.493541376, + "loss": 0.6254, + "grad_norm": 1.676977515220642, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 1.71022336, + "gpu_mem": 4.493734912, + "loss": 0.565, + "grad_norm": 2.4305481910705566, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 1.71022336, + "gpu_mem": 4.493444608, + "loss": 0.6612, + "grad_norm": 2.157471179962158, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 1.710419968, + "gpu_mem": 4.493412352, + "loss": 0.7994, + "grad_norm": 4.444483757019043, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 1.710419968, + "gpu_mem": 4.4934784, + "loss": 0.5482, + "grad_norm": 1.433451771736145, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 1.710419968, + "gpu_mem": 4.493472256, + "loss": 0.5635, + "grad_norm": 1.8430781364440918, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 1.710419968, + "gpu_mem": 4.49370112, + "loss": 0.6294, + "grad_norm": 5.560366153717041, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 1.710616576, + "gpu_mem": 4.49369344, + "loss": 0.5777, + "grad_norm": 3.2145206928253174, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 1.710616576, + "gpu_mem": 4.493659648, + "loss": 0.6842, + "grad_norm": 4.191321849822998, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 1.710616576, + "gpu_mem": 4.493519872, + "loss": 0.5721, + "grad_norm": 1.905380129814148, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 1.710616576, + "gpu_mem": 4.493444608, + "loss": 0.4694, + "grad_norm": 1.2219738960266113, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 1.710616576, + "gpu_mem": 4.493384704, + "loss": 0.5534, + "grad_norm": 1.2279196977615356, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 1.710616576, + "gpu_mem": 4.493458432, + "loss": 0.4954, + "grad_norm": 1.0573623180389404, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 1.710813184, + "gpu_mem": 4.493510656, + "loss": 0.8351, + "grad_norm": 9.962656021118164, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 1.710813184, + "gpu_mem": 4.493642752, + "loss": 0.7547, + "grad_norm": 4.787449836730957, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 1.711009792, + "gpu_mem": 4.493533696, + "loss": 0.6423, + "grad_norm": 3.5397584438323975, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 1.711009792, + "gpu_mem": 4.493413888, + "loss": 0.5834, + "grad_norm": 2.426708698272705, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 1.711009792, + "gpu_mem": 4.493483008, + "loss": 0.6012, + "grad_norm": 3.486109495162964, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 1.711009792, + "gpu_mem": 4.493582848, + "loss": 0.6704, + "grad_norm": 3.4765994548797607, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 1.711009792, + "gpu_mem": 4.493545984, + "loss": 0.6794, + "grad_norm": 5.785215377807617, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 1.711009792, + "gpu_mem": 4.49357824, + "loss": 0.5901, + "grad_norm": 3.048917531967163, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 1.711009792, + "gpu_mem": 4.493529088, + "loss": 0.591, + "grad_norm": 2.1233479976654053, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493536768, + "loss": 0.5762, + "grad_norm": 3.311816692352295, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493681152, + "loss": 0.5188, + "grad_norm": 2.4754672050476074, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 1.7112064, + "gpu_mem": 4.49346304, + "loss": 0.6072, + "grad_norm": 2.305510997772217, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 1.7112064, + "gpu_mem": 4.4935168, + "loss": 0.514, + "grad_norm": 1.615087628364563, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493484544, + "loss": 0.5489, + "grad_norm": 3.731175661087036, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493565952, + "loss": 0.5691, + "grad_norm": 1.1627174615859985, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493369344, + "loss": 0.6813, + "grad_norm": 1.519644856452942, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493483008, + "loss": 0.6063, + "grad_norm": 2.775646924972534, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493502976, + "loss": 0.5314, + "grad_norm": 1.4140139818191528, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493541376, + "loss": 0.4546, + "grad_norm": 1.097731351852417, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493526016, + "loss": 0.6354, + "grad_norm": 1.9989746809005737, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493438464, + "loss": 0.5424, + "grad_norm": 1.8398076295852661, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493387776, + "loss": 0.509, + "grad_norm": 1.4739112854003906, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493504512, + "loss": 0.5889, + "grad_norm": 2.657379388809204, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 1.7112064, + "gpu_mem": 4.493536768, + "loss": 0.5525, + "grad_norm": 3.054154396057129, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 1.711403008, + "gpu_mem": 4.49357056, + "loss": 0.6228, + "grad_norm": 2.1836800575256348, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 1.711403008, + "gpu_mem": 4.493621248, + "loss": 0.5459, + "grad_norm": 1.6511459350585938, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 1.711403008, + "gpu_mem": 4.493526016, + "loss": 0.5682, + "grad_norm": 2.221405267715454, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 1.711403008, + "gpu_mem": 4.493627392, + "loss": 0.6373, + "grad_norm": 2.944551706314087, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 1.711403008, + "gpu_mem": 4.49357824, + "loss": 0.5586, + "grad_norm": 1.5154145956039429, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493466112, + "loss": 0.4648, + "grad_norm": 1.5469335317611694, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493650432, + "loss": 0.5601, + "grad_norm": 1.445845603942871, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493504512, + "loss": 0.5896, + "grad_norm": 1.7318332195281982, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493507584, + "loss": 0.512, + "grad_norm": 1.0013798475265503, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493476864, + "loss": 0.5105, + "grad_norm": 1.6935800313949585, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493522944, + "loss": 0.5226, + "grad_norm": 2.211376667022705, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493513728, + "loss": 0.5076, + "grad_norm": 1.155295729637146, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493495296, + "loss": 0.43, + "grad_norm": 1.0353950262069702, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49357056, + "loss": 0.5259, + "grad_norm": 2.2882490158081055, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493490688, + "loss": 0.5022, + "grad_norm": 1.1948612928390503, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493381632, + "loss": 0.4259, + "grad_norm": 1.4133895635604858, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493619712, + "loss": 0.5497, + "grad_norm": 2.1352763175964355, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493790208, + "loss": 0.476, + "grad_norm": 2.5153393745422363, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493522944, + "loss": 0.5373, + "grad_norm": 3.539487600326538, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493550592, + "loss": 0.5428, + "grad_norm": 2.6697866916656494, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49360128, + "loss": 0.4596, + "grad_norm": 2.2046003341674805, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493410816, + "loss": 0.4837, + "grad_norm": 2.772540330886841, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493853184, + "loss": 0.4259, + "grad_norm": 2.2922492027282715, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493579776, + "loss": 0.4785, + "grad_norm": 1.7634029388427734, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49346304, + "loss": 0.5198, + "grad_norm": 1.8537559509277344, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493902336, + "loss": 0.5083, + "grad_norm": 1.7474989891052246, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49367808, + "loss": 0.4306, + "grad_norm": 2.371370553970337, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493718016, + "loss": 0.472, + "grad_norm": 1.586747646331787, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493499904, + "loss": 0.5928, + "grad_norm": 1.5480819940567017, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493628928, + "loss": 0.4662, + "grad_norm": 1.661648154258728, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493710336, + "loss": 0.3788, + "grad_norm": 2.3356921672821045, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49349376, + "loss": 0.4653, + "grad_norm": 2.2256569862365723, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493627392, + "loss": 0.4198, + "grad_norm": 3.046046733856201, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493650432, + "loss": 0.509, + "grad_norm": 1.861631989479065, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493487616, + "loss": 0.4851, + "grad_norm": 2.905832529067993, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493367808, + "loss": 0.5325, + "grad_norm": 3.637758255004883, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493549056, + "loss": 0.4982, + "grad_norm": 2.983755111694336, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49344768, + "loss": 0.4305, + "grad_norm": 2.5489985942840576, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493499904, + "loss": 0.4568, + "grad_norm": 1.3761025667190552, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49353216, + "loss": 0.5383, + "grad_norm": 2.7972006797790527, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493671936, + "loss": 0.5046, + "grad_norm": 3.7174174785614014, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49365504, + "loss": 0.6168, + "grad_norm": 2.988773822784424, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 1.711599616, + "gpu_mem": 4.49384704, + "loss": 0.4891, + "grad_norm": 1.5629301071166992, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493558272, + "loss": 0.4779, + "grad_norm": 1.830448031425476, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 1.711599616, + "gpu_mem": 4.4935936, + "loss": 0.532, + "grad_norm": 1.983514666557312, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 1.711599616, + "gpu_mem": 4.493492224, + "loss": 0.5437, + "grad_norm": 2.4755373001098633, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518887936, + "loss": 0.7233, + "grad_norm": 4.773542881011963, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518823424, + "loss": 0.4214, + "grad_norm": 2.750079870223999, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518660608, + "loss": 0.4365, + "grad_norm": 2.255727529525757, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 1.711599616, + "gpu_mem": 4.5187328, + "loss": 0.4681, + "grad_norm": 2.083754539489746, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518768128, + "loss": 0.4152, + "grad_norm": 1.3357144594192505, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518792704, + "loss": 0.608, + "grad_norm": 2.6959776878356934, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518754304, + "loss": 0.4753, + "grad_norm": 2.5216684341430664, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518975488, + "loss": 0.4919, + "grad_norm": 2.7383933067321777, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518883328, + "loss": 0.4303, + "grad_norm": 3.2136483192443848, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518789632, + "loss": 0.431, + "grad_norm": 1.8872498273849487, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518715904, + "loss": 0.3988, + "grad_norm": 1.6168367862701416, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 1.711599616, + "gpu_mem": 4.519064576, + "loss": 0.3525, + "grad_norm": 1.5681523084640503, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518659072, + "loss": 0.3675, + "grad_norm": 2.2624340057373047, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518605312, + "loss": 0.3675, + "grad_norm": 1.7178032398223877, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 1.711599616, + "gpu_mem": 4.519380992, + "loss": 0.3346, + "grad_norm": 1.4177395105361938, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518857216, + "loss": 0.4544, + "grad_norm": 2.216475009918213, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518769664, + "loss": 0.434, + "grad_norm": 2.2830970287323, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518718976, + "loss": 0.2884, + "grad_norm": 1.7546035051345825, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518814208, + "loss": 0.3606, + "grad_norm": 2.0118448734283447, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518731264, + "loss": 0.4171, + "grad_norm": 2.5143301486968994, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518749696, + "loss": 0.434, + "grad_norm": 2.1411123275756836, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518837248, + "loss": 0.4613, + "grad_norm": 3.5322885513305664, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518722048, + "loss": 0.3929, + "grad_norm": 2.741533041000366, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518785024, + "loss": 0.4549, + "grad_norm": 2.226236343383789, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518692864, + "loss": 0.3462, + "grad_norm": 2.467480421066284, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518992384, + "loss": 0.4363, + "grad_norm": 2.3100264072418213, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518715904, + "loss": 0.4209, + "grad_norm": 1.8846086263656616, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518682112, + "loss": 0.4885, + "grad_norm": 2.6262102127075195, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518820352, + "loss": 0.4125, + "grad_norm": 2.492753028869629, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518918656, + "loss": 0.4146, + "grad_norm": 2.1673343181610107, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518665216, + "loss": 0.3936, + "grad_norm": 2.1583871841430664, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518765056, + "loss": 0.5185, + "grad_norm": 2.772233486175537, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518737408, + "loss": 0.357, + "grad_norm": 2.3532066345214844, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518674432, + "loss": 0.3485, + "grad_norm": 2.274549961090088, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518892544, + "loss": 0.4276, + "grad_norm": 2.6826555728912354, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518789632, + "loss": 0.4081, + "grad_norm": 1.8423587083816528, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518737408, + "loss": 0.3642, + "grad_norm": 2.0157456398010254, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518715904, + "loss": 0.411, + "grad_norm": 1.8380581140518188, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 1.711599616, + "gpu_mem": 4.51872512, + "loss": 0.4042, + "grad_norm": 2.248631000518799, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518657536, + "loss": 0.4149, + "grad_norm": 2.107780694961548, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518820352, + "loss": 0.2966, + "grad_norm": 1.9196923971176147, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518689792, + "loss": 0.4637, + "grad_norm": 2.4447083473205566, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 1.711599616, + "gpu_mem": 4.5188096, + "loss": 0.4792, + "grad_norm": 2.078752040863037, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518628352, + "loss": 0.3097, + "grad_norm": 2.0523102283477783, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518760448, + "loss": 0.3243, + "grad_norm": 2.661395311355591, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518734336, + "loss": 0.3492, + "grad_norm": 1.5497900247573853, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518700544, + "loss": 0.3145, + "grad_norm": 1.7822644710540771, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518804992, + "loss": 0.2805, + "grad_norm": 1.917203664779663, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518800384, + "loss": 0.4078, + "grad_norm": 1.7383885383605957, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518659072, + "loss": 0.3376, + "grad_norm": 2.438589572906494, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518851072, + "loss": 0.3787, + "grad_norm": 1.634804368019104, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 1.711599616, + "gpu_mem": 4.51870208, + "loss": 0.3405, + "grad_norm": 2.3534703254699707, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518804992, + "loss": 0.3231, + "grad_norm": 2.043266534805298, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 1.711599616, + "gpu_mem": 4.519007744, + "loss": 0.301, + "grad_norm": 1.762511968612671, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 1.711599616, + "gpu_mem": 4.5188096, + "loss": 0.3401, + "grad_norm": 2.244468927383423, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518695936, + "loss": 0.3318, + "grad_norm": 1.7492423057556152, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 1.711599616, + "gpu_mem": 4.51870976, + "loss": 0.2814, + "grad_norm": 2.2708284854888916, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518754304, + "loss": 0.3235, + "grad_norm": 2.096971273422241, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518695936, + "loss": 0.332, + "grad_norm": 1.9460471868515015, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518929408, + "loss": 0.4989, + "grad_norm": 3.4345767498016357, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518960128, + "loss": 0.3981, + "grad_norm": 4.654128074645996, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 1.711599616, + "gpu_mem": 4.5188864, + "loss": 0.4154, + "grad_norm": 2.861698627471924, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518774272, + "loss": 0.3594, + "grad_norm": 2.289139747619629, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518723584, + "loss": 0.3542, + "grad_norm": 3.2476539611816406, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518692864, + "loss": 0.2731, + "grad_norm": 1.6506167650222778, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518715904, + "loss": 0.3918, + "grad_norm": 3.287315845489502, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518798848, + "loss": 0.3327, + "grad_norm": 2.6938748359680176, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518726656, + "loss": 0.5451, + "grad_norm": 5.716137886047363, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518892544, + "loss": 0.358, + "grad_norm": 2.027221918106079, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518734336, + "loss": 0.4227, + "grad_norm": 3.2400901317596436, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518711296, + "loss": 0.3567, + "grad_norm": 2.3764185905456543, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518837248, + "loss": 0.3969, + "grad_norm": 2.774723768234253, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 1.711599616, + "gpu_mem": 4.51887104, + "loss": 0.3446, + "grad_norm": 2.1695356369018555, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518738944, + "loss": 0.3947, + "grad_norm": 2.8588099479675293, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518875648, + "loss": 0.4086, + "grad_norm": 2.292579412460327, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518789632, + "loss": 0.3436, + "grad_norm": 3.475487232208252, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518754304, + "loss": 0.2352, + "grad_norm": 1.5501208305358887, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518718976, + "loss": 0.2872, + "grad_norm": 1.8750560283660889, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518867968, + "loss": 0.3367, + "grad_norm": 2.030620574951172, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518757376, + "loss": 0.4134, + "grad_norm": 2.3314332962036133, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 1.711599616, + "gpu_mem": 4.51870208, + "loss": 0.3765, + "grad_norm": 2.124023675918579, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518643712, + "loss": 0.3267, + "grad_norm": 1.6352308988571167, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518700544, + "loss": 0.3158, + "grad_norm": 1.8632351160049438, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 1.711599616, + "gpu_mem": 4.51897856, + "loss": 0.3573, + "grad_norm": 2.3792884349823, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 1.711599616, + "gpu_mem": 4.51870208, + "loss": 0.3558, + "grad_norm": 1.9817919731140137, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 1.711599616, + "gpu_mem": 4.519015424, + "loss": 0.314, + "grad_norm": 2.540290594100952, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518891008, + "loss": 0.3529, + "grad_norm": 2.5171523094177246, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518646784, + "loss": 0.2473, + "grad_norm": 1.6396665573120117, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518706688, + "loss": 0.4517, + "grad_norm": 2.5672972202301025, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518768128, + "loss": 0.4687, + "grad_norm": 2.4368391036987305, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518769664, + "loss": 0.3534, + "grad_norm": 2.1598002910614014, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 1.711599616, + "gpu_mem": 4.519023104, + "loss": 0.3769, + "grad_norm": 2.201073169708252, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518672896, + "loss": 0.4793, + "grad_norm": 3.0026707649230957, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518969344, + "loss": 0.3807, + "grad_norm": 2.082353115081787, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 1.711599616, + "gpu_mem": 4.518831104, + "loss": 0.3843, + "grad_norm": 2.338231086730957, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518683648, + "loss": 0.4293, + "grad_norm": 2.2972209453582764, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518823424, + "loss": 0.2622, + "grad_norm": 2.2086148262023926, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 1.711796224, + "gpu_mem": 4.51870208, + "loss": 0.3806, + "grad_norm": 2.3076648712158203, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518795776, + "loss": 0.4393, + "grad_norm": 1.8375837802886963, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518814208, + "loss": 0.2546, + "grad_norm": 1.4918547868728638, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518766592, + "loss": 0.3267, + "grad_norm": 2.361802339553833, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518672896, + "loss": 0.242, + "grad_norm": 2.0840659141540527, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518765056, + "loss": 0.4094, + "grad_norm": 2.5733115673065186, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518677504, + "loss": 0.2575, + "grad_norm": 1.691854476928711, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518720512, + "loss": 0.3795, + "grad_norm": 2.6459226608276367, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518914048, + "loss": 0.3931, + "grad_norm": 2.409576177597046, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 1.711796224, + "gpu_mem": 4.51870976, + "loss": 0.4196, + "grad_norm": 2.6486446857452393, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518866432, + "loss": 0.4152, + "grad_norm": 2.8445231914520264, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518692864, + "loss": 0.3302, + "grad_norm": 2.0517022609710693, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 1.711796224, + "gpu_mem": 4.519106048, + "loss": 0.3846, + "grad_norm": 1.5023503303527832, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518765056, + "loss": 0.3792, + "grad_norm": 2.7162036895751953, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518682112, + "loss": 0.3736, + "grad_norm": 1.9557006359100342, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518798848, + "loss": 0.2528, + "grad_norm": 1.3790342807769775, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518754304, + "loss": 0.214, + "grad_norm": 1.6710273027420044, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518711296, + "loss": 0.292, + "grad_norm": 1.720982313156128, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 1.711796224, + "gpu_mem": 4.51874816, + "loss": 0.3357, + "grad_norm": 2.2081875801086426, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518835712, + "loss": 0.3131, + "grad_norm": 2.31950044631958, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518754304, + "loss": 0.3637, + "grad_norm": 2.1394689083099365, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518969344, + "loss": 0.4491, + "grad_norm": 2.222255229949951, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518761984, + "loss": 0.4085, + "grad_norm": 2.8149144649505615, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518766592, + "loss": 0.3, + "grad_norm": 2.116175889968872, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518777344, + "loss": 0.3966, + "grad_norm": 2.7279956340789795, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518815744, + "loss": 0.3116, + "grad_norm": 2.122032880783081, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518867968, + "loss": 0.3683, + "grad_norm": 2.1653904914855957, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518726656, + "loss": 0.358, + "grad_norm": 1.8753836154937744, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518606848, + "loss": 0.3515, + "grad_norm": 1.9849605560302734, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518834176, + "loss": 0.3148, + "grad_norm": 2.3825619220733643, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 1.711796224, + "gpu_mem": 4.5190784, + "loss": 0.3742, + "grad_norm": 3.0467369556427, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518738944, + "loss": 0.3349, + "grad_norm": 2.1529831886291504, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518685184, + "loss": 0.3648, + "grad_norm": 2.077089548110962, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518848, + "loss": 0.4342, + "grad_norm": 1.961988091468811, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518788096, + "loss": 0.3524, + "grad_norm": 1.8061267137527466, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518768128, + "loss": 0.3263, + "grad_norm": 2.02156400680542, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518703616, + "loss": 0.4553, + "grad_norm": 2.5121686458587646, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 1.711796224, + "gpu_mem": 4.51913216, + "loss": 0.2958, + "grad_norm": 2.423207998275757, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518838784, + "loss": 0.2953, + "grad_norm": 1.8908737897872925, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 1.711796224, + "gpu_mem": 4.5186944, + "loss": 0.393, + "grad_norm": 2.883166551589966, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 1.711796224, + "gpu_mem": 4.51874816, + "loss": 0.3575, + "grad_norm": 2.05812406539917, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 1.711796224, + "gpu_mem": 4.519165952, + "loss": 0.2564, + "grad_norm": 1.4015121459960938, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518935552, + "loss": 0.3584, + "grad_norm": 2.0995254516601562, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518720512, + "loss": 0.4958, + "grad_norm": 2.2084200382232666, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518804992, + "loss": 0.2645, + "grad_norm": 1.6111083030700684, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518729728, + "loss": 0.4146, + "grad_norm": 2.890242099761963, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518765056, + "loss": 0.3453, + "grad_norm": 2.0175983905792236, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518848, + "loss": 0.3555, + "grad_norm": 1.9345208406448364, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518765056, + "loss": 0.44, + "grad_norm": 2.891662359237671, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518791168, + "loss": 0.3464, + "grad_norm": 2.649435043334961, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 1.711796224, + "gpu_mem": 4.518791168, + "train_runtime": 4555.2825, + "train_samples_per_second": 4.139, + "train_steps_per_second": 0.065, + "total_flos": 4.723488642741043e+16, + "train_loss": 0.7193643490777535 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26ebe9ef584396639cb6b281f2c8108d7f3fd14a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a69dfd7771cdf86e07b50c22dba05b0536435648 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.7937661820354511 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..68b7c29322ebc26f71f93f59a4e654a468a0ee20 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-hellaswag-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2", + "seed": 42, + "timestamp": "2025-08-29T18:27:59.111520" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..2d04842bf4839f9271f54f76f8c890c6613e6297 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r2-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.680371712, + "gpu_mem": 4.42406656, + "loss": 3.4877, + "grad_norm": 11.387960433959961, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.686663168, + "gpu_mem": 4.436675584, + "loss": 3.6203, + "grad_norm": 11.125204086303711, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.687842816, + "gpu_mem": 4.436683264, + "loss": 3.4223, + "grad_norm": 11.065898895263672, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.688825856, + "gpu_mem": 4.436717056, + "loss": 3.5962, + "grad_norm": 10.81956672668457, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.689808896, + "gpu_mem": 4.436680192, + "loss": 3.48, + "grad_norm": 11.02424144744873, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.690791936, + "gpu_mem": 4.436726272, + "loss": 3.5512, + "grad_norm": 11.792925834655762, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.691774976, + "gpu_mem": 4.436686336, + "loss": 3.484, + "grad_norm": 10.678176879882812, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.692758016, + "gpu_mem": 4.436717056, + "loss": 3.1746, + "grad_norm": 11.425053596496582, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.693544448, + "gpu_mem": 4.436717056, + "loss": 3.0561, + "grad_norm": 10.597249031066895, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.69433088, + "gpu_mem": 4.436660224, + "loss": 2.8393, + "grad_norm": 9.874714851379395, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.694920704, + "gpu_mem": 4.436680192, + "loss": 2.7081, + "grad_norm": 10.810506820678711, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.695510528, + "gpu_mem": 4.43667712, + "loss": 2.8511, + "grad_norm": 10.263282775878906, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.69629696, + "gpu_mem": 4.43666944, + "loss": 2.5298, + "grad_norm": 8.528789520263672, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.69728, + "gpu_mem": 4.436695552, + "loss": 2.3208, + "grad_norm": 7.361415386199951, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.697869824, + "gpu_mem": 4.436694016, + "loss": 2.0338, + "grad_norm": 6.145046710968018, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.698656256, + "gpu_mem": 4.436686336, + "loss": 2.0417, + "grad_norm": 5.776154518127441, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.699442688, + "gpu_mem": 4.436686336, + "loss": 1.8005, + "grad_norm": 4.089014053344727, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.700032512, + "gpu_mem": 4.436686336, + "loss": 1.7583, + "grad_norm": 3.8473501205444336, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.700818944, + "gpu_mem": 4.436686336, + "loss": 1.6092, + "grad_norm": 2.8838818073272705, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.701408768, + "gpu_mem": 4.436660224, + "loss": 1.4655, + "grad_norm": 2.137171745300293, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.701998592, + "gpu_mem": 4.43667712, + "loss": 1.4584, + "grad_norm": 2.145756959915161, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.702785024, + "gpu_mem": 4.4366848, + "loss": 1.5057, + "grad_norm": 2.5071699619293213, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.703374848, + "gpu_mem": 4.436698624, + "loss": 1.3901, + "grad_norm": 1.18629789352417, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.703964672, + "gpu_mem": 4.436683264, + "loss": 1.3718, + "grad_norm": 1.0667744874954224, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.704554496, + "gpu_mem": 4.436670976, + "loss": 1.5644, + "grad_norm": 4.299117088317871, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.70514432, + "gpu_mem": 4.43667712, + "loss": 1.6022, + "grad_norm": 5.017867088317871, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.705734144, + "gpu_mem": 4.4366848, + "loss": 1.4876, + "grad_norm": 3.3445303440093994, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.706323968, + "gpu_mem": 4.436680192, + "loss": 1.5141, + "grad_norm": 3.7309370040893555, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.706913792, + "gpu_mem": 4.436689408, + "loss": 1.3932, + "grad_norm": 0.9375073909759521, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.707503616, + "gpu_mem": 4.43666176, + "loss": 1.3986, + "grad_norm": 1.4223915338516235, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.70809344, + "gpu_mem": 4.436717056, + "loss": 1.3914, + "grad_norm": 1.14540696144104, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.708683264, + "gpu_mem": 4.436709376, + "loss": 1.4237, + "grad_norm": 2.0915143489837646, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.709273088, + "gpu_mem": 4.436663296, + "loss": 1.3881, + "grad_norm": 1.0834895372390747, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.709862912, + "gpu_mem": 4.436681728, + "loss": 1.4056, + "grad_norm": 1.0566556453704834, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.710452736, + "gpu_mem": 4.436703232, + "loss": 1.3898, + "grad_norm": 2.1398050785064697, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.71104256, + "gpu_mem": 4.436701696, + "loss": 1.3856, + "grad_norm": 1.0110061168670654, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.711632384, + "gpu_mem": 4.436733952, + "loss": 1.3988, + "grad_norm": 0.9300307631492615, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.712222208, + "gpu_mem": 4.436686336, + "loss": 1.4167, + "grad_norm": 1.1721677780151367, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.712812032, + "gpu_mem": 4.436743168, + "loss": 1.3701, + "grad_norm": 2.0884902477264404, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.713205248, + "gpu_mem": 4.436670976, + "loss": 1.4441, + "grad_norm": 1.7054357528686523, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.713598464, + "gpu_mem": 4.436698624, + "loss": 1.4028, + "grad_norm": 0.843626856803894, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.714188288, + "gpu_mem": 4.436712448, + "loss": 1.3912, + "grad_norm": 0.8109623789787292, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.714778112, + "gpu_mem": 4.436718592, + "loss": 1.3709, + "grad_norm": 0.5446664094924927, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.715367936, + "gpu_mem": 4.436697088, + "loss": 1.4, + "grad_norm": 0.7696340084075928, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.71595776, + "gpu_mem": 4.436697088, + "loss": 1.39, + "grad_norm": 0.6064107418060303, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.716350976, + "gpu_mem": 4.436697088, + "loss": 1.4161, + "grad_norm": 1.5359725952148438, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.716744192, + "gpu_mem": 4.436683264, + "loss": 1.3981, + "grad_norm": 0.6635174751281738, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.717334016, + "gpu_mem": 4.436701696, + "loss": 1.3738, + "grad_norm": 1.0175073146820068, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.71792384, + "gpu_mem": 4.436713984, + "loss": 1.4146, + "grad_norm": 1.1613006591796875, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.718513664, + "gpu_mem": 4.436690944, + "loss": 1.393, + "grad_norm": 1.155881404876709, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.71890688, + "gpu_mem": 4.436675584, + "loss": 1.3602, + "grad_norm": 0.7468127012252808, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.719496704, + "gpu_mem": 4.436680192, + "loss": 1.3643, + "grad_norm": 0.602774977684021, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.71988992, + "gpu_mem": 4.43670784, + "loss": 1.4339, + "grad_norm": 1.747943639755249, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.720283136, + "gpu_mem": 4.436683264, + "loss": 1.4284, + "grad_norm": 1.7197283506393433, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.720676352, + "gpu_mem": 4.436701696, + "loss": 1.397, + "grad_norm": 1.0936845541000366, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.721266176, + "gpu_mem": 4.436695552, + "loss": 1.401, + "grad_norm": 1.0741146802902222, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.721856, + "gpu_mem": 4.43666176, + "loss": 1.3932, + "grad_norm": 0.8704217672348022, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.722249216, + "gpu_mem": 4.436690944, + "loss": 1.4133, + "grad_norm": 0.9307969808578491, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.72283904, + "gpu_mem": 4.436674048, + "loss": 1.3474, + "grad_norm": 0.7573204040527344, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.723232256, + "gpu_mem": 4.43671552, + "loss": 1.3916, + "grad_norm": 1.1691898107528687, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.723625472, + "gpu_mem": 4.436681728, + "loss": 1.4037, + "grad_norm": 0.5584121346473694, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.724215296, + "gpu_mem": 4.436721664, + "loss": 1.3497, + "grad_norm": 0.9253063797950745, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.724608512, + "gpu_mem": 4.436675584, + "loss": 1.4358, + "grad_norm": 0.8931470513343811, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.725198336, + "gpu_mem": 4.436680192, + "loss": 1.4493, + "grad_norm": 0.8944385647773743, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.725591552, + "gpu_mem": 4.43667712, + "loss": 1.4391, + "grad_norm": 1.0394381284713745, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.725984768, + "gpu_mem": 4.436695552, + "loss": 1.4089, + "grad_norm": 0.6341714262962341, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.726574592, + "gpu_mem": 4.436687872, + "loss": 1.3989, + "grad_norm": 0.5706843733787537, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.726967808, + "gpu_mem": 4.436672512, + "loss": 1.4638, + "grad_norm": 1.8624440431594849, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.727361024, + "gpu_mem": 4.436743168, + "loss": 1.4051, + "grad_norm": 1.0419940948486328, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.727950848, + "gpu_mem": 4.436694016, + "loss": 1.4093, + "grad_norm": 1.1991232633590698, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.728344064, + "gpu_mem": 4.436718592, + "loss": 1.3528, + "grad_norm": 0.951556921005249, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.72873728, + "gpu_mem": 4.436689408, + "loss": 1.4192, + "grad_norm": 1.1266828775405884, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.729130496, + "gpu_mem": 4.436681728, + "loss": 1.4693, + "grad_norm": 1.841294527053833, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.72972032, + "gpu_mem": 4.436675584, + "loss": 1.3836, + "grad_norm": 0.557756245136261, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.730113536, + "gpu_mem": 4.436704768, + "loss": 1.4079, + "grad_norm": 1.163779616355896, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.730506752, + "gpu_mem": 4.436695552, + "loss": 1.4306, + "grad_norm": 1.7474571466445923, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.730899968, + "gpu_mem": 4.436683264, + "loss": 1.3973, + "grad_norm": 0.47750669717788696, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.731293184, + "gpu_mem": 4.436675584, + "loss": 1.4225, + "grad_norm": 1.433131456375122, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.7316864, + "gpu_mem": 4.436727808, + "loss": 1.399, + "grad_norm": 0.6875696182250977, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.732079616, + "gpu_mem": 4.436706304, + "loss": 1.4122, + "grad_norm": 0.8823260068893433, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.732472832, + "gpu_mem": 4.43670016, + "loss": 1.3607, + "grad_norm": 0.3215961754322052, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.732866048, + "gpu_mem": 4.43667712, + "loss": 1.3859, + "grad_norm": 0.9962493181228638, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.733259264, + "gpu_mem": 4.436698624, + "loss": 1.4549, + "grad_norm": 1.4185404777526855, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.73365248, + "gpu_mem": 4.436670976, + "loss": 1.4546, + "grad_norm": 1.2171250581741333, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.734045696, + "gpu_mem": 4.436678656, + "loss": 1.3975, + "grad_norm": 0.7055566906929016, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.734438912, + "gpu_mem": 4.436697088, + "loss": 1.3793, + "grad_norm": 0.5240998864173889, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.734832128, + "gpu_mem": 4.436686336, + "loss": 1.3837, + "grad_norm": 0.430370956659317, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.735225344, + "gpu_mem": 4.4366848, + "loss": 1.3838, + "grad_norm": 0.5287942886352539, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.73561856, + "gpu_mem": 4.436680192, + "loss": 1.4227, + "grad_norm": 0.7389117479324341, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.736011776, + "gpu_mem": 4.4366848, + "loss": 1.4008, + "grad_norm": 0.8164942860603333, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.736404992, + "gpu_mem": 4.436695552, + "loss": 1.3829, + "grad_norm": 0.5718284845352173, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.736798208, + "gpu_mem": 4.436698624, + "loss": 1.4247, + "grad_norm": 0.502756655216217, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.737191424, + "gpu_mem": 4.436698624, + "loss": 1.4165, + "grad_norm": 0.45046237111091614, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.73758464, + "gpu_mem": 4.436694016, + "loss": 1.4188, + "grad_norm": 0.5727940201759338, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.737977856, + "gpu_mem": 4.436712448, + "loss": 1.3674, + "grad_norm": 0.6050216555595398, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.738371072, + "gpu_mem": 4.43671552, + "loss": 1.3814, + "grad_norm": 0.24622002243995667, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.738764288, + "gpu_mem": 4.43669248, + "loss": 1.4065, + "grad_norm": 0.5195072889328003, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.739157504, + "gpu_mem": 4.436703232, + "loss": 1.4024, + "grad_norm": 0.5982215404510498, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.739354112, + "gpu_mem": 4.436703232, + "loss": 1.3754, + "grad_norm": 0.7639931440353394, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.739747328, + "gpu_mem": 4.436678656, + "loss": 1.3919, + "grad_norm": 0.4277855455875397, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.740140544, + "gpu_mem": 4.43670784, + "loss": 1.4172, + "grad_norm": 1.2373700141906738, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.74053376, + "gpu_mem": 4.4366848, + "loss": 1.3579, + "grad_norm": 0.5327304005622864, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.740926976, + "gpu_mem": 4.436701696, + "loss": 1.4196, + "grad_norm": 0.9115890264511108, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.741320192, + "gpu_mem": 4.43666944, + "loss": 1.4158, + "grad_norm": 0.9910538792610168, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.741713408, + "gpu_mem": 4.4366848, + "loss": 1.3809, + "grad_norm": 0.507165789604187, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.741910016, + "gpu_mem": 4.436664832, + "loss": 1.4083, + "grad_norm": 0.8599710464477539, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.742106624, + "gpu_mem": 4.436706304, + "loss": 1.3909, + "grad_norm": 0.48854121565818787, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.74249984, + "gpu_mem": 4.436701696, + "loss": 1.3933, + "grad_norm": 0.5948444604873657, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.742893056, + "gpu_mem": 4.43670784, + "loss": 1.3977, + "grad_norm": 0.812613844871521, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.743286272, + "gpu_mem": 4.436704768, + "loss": 1.392, + "grad_norm": 0.42644554376602173, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.74348288, + "gpu_mem": 4.436706304, + "loss": 1.3677, + "grad_norm": 0.40488898754119873, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.743876096, + "gpu_mem": 4.436703232, + "loss": 1.3792, + "grad_norm": 0.3671112060546875, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.744269312, + "gpu_mem": 4.436683264, + "loss": 1.4025, + "grad_norm": 0.3778337240219116, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.744662528, + "gpu_mem": 4.436678656, + "loss": 1.3587, + "grad_norm": 0.5280987620353699, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.744859136, + "gpu_mem": 4.436697088, + "loss": 1.4144, + "grad_norm": 0.5694826245307922, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.745252352, + "gpu_mem": 4.43670784, + "loss": 1.41, + "grad_norm": 0.5348560214042664, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.745645568, + "gpu_mem": 4.436694016, + "loss": 1.3984, + "grad_norm": 0.4783788025379181, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.745842176, + "gpu_mem": 4.436709376, + "loss": 1.4357, + "grad_norm": 1.021403431892395, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.746235392, + "gpu_mem": 4.436690944, + "loss": 1.3583, + "grad_norm": 0.5122146010398865, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.746628608, + "gpu_mem": 4.436717056, + "loss": 1.4024, + "grad_norm": 0.6580822467803955, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.747021824, + "gpu_mem": 4.436675584, + "loss": 1.3787, + "grad_norm": 0.494693785905838, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.747218432, + "gpu_mem": 4.43670784, + "loss": 1.3902, + "grad_norm": 0.5698670744895935, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.747611648, + "gpu_mem": 4.436701696, + "loss": 1.4024, + "grad_norm": 0.39740651845932007, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.748004864, + "gpu_mem": 4.436703232, + "loss": 1.3758, + "grad_norm": 0.4601883590221405, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.74839808, + "gpu_mem": 4.436678656, + "loss": 1.3918, + "grad_norm": 0.6560602188110352, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.748791296, + "gpu_mem": 4.436687872, + "loss": 1.3674, + "grad_norm": 0.623936653137207, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.748987904, + "gpu_mem": 4.436674048, + "loss": 1.4145, + "grad_norm": 0.876342236995697, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.74938112, + "gpu_mem": 4.436710912, + "loss": 1.3703, + "grad_norm": 0.4721430540084839, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.749577728, + "gpu_mem": 4.43670784, + "loss": 1.3875, + "grad_norm": 0.23871108889579773, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.749970944, + "gpu_mem": 4.43670784, + "loss": 1.4101, + "grad_norm": 0.6306278109550476, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.75036416, + "gpu_mem": 4.436697088, + "loss": 1.3967, + "grad_norm": 0.544289231300354, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.750560768, + "gpu_mem": 4.436697088, + "loss": 1.3602, + "grad_norm": 0.4631349742412567, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.750757376, + "gpu_mem": 4.436678656, + "loss": 1.384, + "grad_norm": 0.439609169960022, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.751150592, + "gpu_mem": 4.436689408, + "loss": 1.414, + "grad_norm": 0.7468196153640747, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.751543808, + "gpu_mem": 4.436698624, + "loss": 1.3738, + "grad_norm": 0.460167795419693, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.751740416, + "gpu_mem": 4.436713984, + "loss": 1.4136, + "grad_norm": 1.1817598342895508, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.752133632, + "gpu_mem": 4.43666176, + "loss": 1.3974, + "grad_norm": 0.7489287257194519, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.75233024, + "gpu_mem": 4.436681728, + "loss": 1.3913, + "grad_norm": 0.4264850318431854, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.752526848, + "gpu_mem": 4.436663296, + "loss": 1.3748, + "grad_norm": 0.5493107438087463, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.752920064, + "gpu_mem": 4.436680192, + "loss": 1.4021, + "grad_norm": 0.9427160620689392, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.753116672, + "gpu_mem": 4.436686336, + "loss": 1.4109, + "grad_norm": 0.9137815833091736, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.75331328, + "gpu_mem": 4.436683264, + "loss": 1.3923, + "grad_norm": 0.5314501523971558, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.753706496, + "gpu_mem": 4.436709376, + "loss": 1.3797, + "grad_norm": 0.2517310082912445, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.753903104, + "gpu_mem": 4.436683264, + "loss": 1.3923, + "grad_norm": 0.34247902035713196, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.75429632, + "gpu_mem": 4.4367232, + "loss": 1.3883, + "grad_norm": 0.700568437576294, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.754689536, + "gpu_mem": 4.436672512, + "loss": 1.3976, + "grad_norm": 0.525873064994812, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.754886144, + "gpu_mem": 4.436681728, + "loss": 1.397, + "grad_norm": 0.6007851362228394, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.755082752, + "gpu_mem": 4.436701696, + "loss": 1.3672, + "grad_norm": 0.442802757024765, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.755475968, + "gpu_mem": 4.43669248, + "loss": 1.3804, + "grad_norm": 0.4429939091205597, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.755672576, + "gpu_mem": 4.436704768, + "loss": 1.404, + "grad_norm": 0.6398499608039856, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.756065792, + "gpu_mem": 4.43666944, + "loss": 1.4186, + "grad_norm": 0.9832616448402405, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.7562624, + "gpu_mem": 4.43670016, + "loss": 1.4164, + "grad_norm": 0.9809108376502991, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.756459008, + "gpu_mem": 4.436695552, + "loss": 1.3781, + "grad_norm": 0.5804086923599243, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.756655616, + "gpu_mem": 4.436720128, + "loss": 1.3758, + "grad_norm": 0.33229586482048035, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.756852224, + "gpu_mem": 4.436657152, + "loss": 1.4046, + "grad_norm": 0.5182873606681824, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.75724544, + "gpu_mem": 4.436710912, + "loss": 1.3741, + "grad_norm": 0.41128674149513245, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.757442048, + "gpu_mem": 4.436703232, + "loss": 1.3797, + "grad_norm": 0.2869640588760376, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.757835264, + "gpu_mem": 4.436718592, + "loss": 1.4106, + "grad_norm": 0.5420998930931091, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.75822848, + "gpu_mem": 4.436720128, + "loss": 1.4027, + "grad_norm": 0.48017367720603943, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.758425088, + "gpu_mem": 4.436687872, + "loss": 1.4653, + "grad_norm": 0.9793354272842407, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.758621696, + "gpu_mem": 4.436680192, + "loss": 1.41, + "grad_norm": 0.5573543310165405, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.758818304, + "gpu_mem": 4.43671552, + "loss": 1.4021, + "grad_norm": 0.43043625354766846, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.75921152, + "gpu_mem": 4.436663296, + "loss": 1.3999, + "grad_norm": 0.6561245918273926, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.759408128, + "gpu_mem": 4.436698624, + "loss": 1.3879, + "grad_norm": 0.6344681978225708, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.759604736, + "gpu_mem": 4.436687872, + "loss": 1.3769, + "grad_norm": 0.2274082452058792, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.759801344, + "gpu_mem": 4.436720128, + "loss": 1.3767, + "grad_norm": 0.5217154622077942, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.759997952, + "gpu_mem": 4.436713984, + "loss": 1.3804, + "grad_norm": 0.924683690071106, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.760391168, + "gpu_mem": 4.43669248, + "loss": 1.3853, + "grad_norm": 0.5467039346694946, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.760587776, + "gpu_mem": 4.436689408, + "loss": 1.4025, + "grad_norm": 1.1411949396133423, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.760784384, + "gpu_mem": 4.436726272, + "loss": 1.3991, + "grad_norm": 0.8148717880249023, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.7611776, + "gpu_mem": 4.43670016, + "loss": 1.3809, + "grad_norm": 1.115392804145813, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.761374208, + "gpu_mem": 4.43667712, + "loss": 1.3967, + "grad_norm": 1.279757022857666, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.761570816, + "gpu_mem": 4.436703232, + "loss": 1.416, + "grad_norm": 1.4749424457550049, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.761767424, + "gpu_mem": 4.43671552, + "loss": 1.4131, + "grad_norm": 1.5165612697601318, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.761964032, + "gpu_mem": 4.43667712, + "loss": 1.3812, + "grad_norm": 0.6922191381454468, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.76216064, + "gpu_mem": 4.436686336, + "loss": 1.4029, + "grad_norm": 0.8346382975578308, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.762357248, + "gpu_mem": 4.436667904, + "loss": 1.4089, + "grad_norm": 1.2475104331970215, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.762750464, + "gpu_mem": 4.436681728, + "loss": 1.4358, + "grad_norm": 1.5087765455245972, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.762947072, + "gpu_mem": 4.436689408, + "loss": 1.4108, + "grad_norm": 0.7440826296806335, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.763340288, + "gpu_mem": 4.436670976, + "loss": 1.3984, + "grad_norm": 0.7595475912094116, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.763536896, + "gpu_mem": 4.436701696, + "loss": 1.3995, + "grad_norm": 0.6703044772148132, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.763733504, + "gpu_mem": 4.436672512, + "loss": 1.3874, + "grad_norm": 0.7054870128631592, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.763930112, + "gpu_mem": 4.436697088, + "loss": 1.3737, + "grad_norm": 0.36180731654167175, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.76412672, + "gpu_mem": 4.43667712, + "loss": 1.4142, + "grad_norm": 1.052125334739685, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.764519936, + "gpu_mem": 4.436709376, + "loss": 1.4457, + "grad_norm": 1.5841379165649414, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.764716544, + "gpu_mem": 4.43670016, + "loss": 1.4027, + "grad_norm": 0.7962464690208435, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.764913152, + "gpu_mem": 4.436695552, + "loss": 1.4271, + "grad_norm": 1.4927072525024414, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.764913152, + "gpu_mem": 4.436652544, + "loss": 1.3689, + "grad_norm": 0.2578808069229126, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.765306368, + "gpu_mem": 4.436732416, + "loss": 1.3873, + "grad_norm": 0.2664458453655243, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.765502976, + "gpu_mem": 4.436683264, + "loss": 1.4074, + "grad_norm": 0.8936998248100281, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.765699584, + "gpu_mem": 4.436683264, + "loss": 1.3857, + "grad_norm": 0.6013986468315125, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.765896192, + "gpu_mem": 4.436649472, + "loss": 1.3951, + "grad_norm": 0.6670477986335754, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.7660928, + "gpu_mem": 4.436689408, + "loss": 1.3644, + "grad_norm": 0.5989495515823364, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.766289408, + "gpu_mem": 4.4366848, + "loss": 1.3952, + "grad_norm": 0.8915923833847046, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.766486016, + "gpu_mem": 4.436672512, + "loss": 1.3903, + "grad_norm": 0.589945375919342, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.766682624, + "gpu_mem": 4.436697088, + "loss": 1.4144, + "grad_norm": 1.4260048866271973, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.766879232, + "gpu_mem": 4.436712448, + "loss": 1.4053, + "grad_norm": 1.229611873626709, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.76707584, + "gpu_mem": 4.4366848, + "loss": 1.3721, + "grad_norm": 0.5831228494644165, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.767272448, + "gpu_mem": 4.436681728, + "loss": 1.3882, + "grad_norm": 0.7441267967224121, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.767469056, + "gpu_mem": 4.436697088, + "loss": 1.4285, + "grad_norm": 0.8789777755737305, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.767665664, + "gpu_mem": 4.436674048, + "loss": 1.3972, + "grad_norm": 0.3937188684940338, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.767862272, + "gpu_mem": 4.436675584, + "loss": 1.3939, + "grad_norm": 0.5055484771728516, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.76805888, + "gpu_mem": 4.436717056, + "loss": 1.3909, + "grad_norm": 0.8342368602752686, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.768255488, + "gpu_mem": 4.436687872, + "loss": 1.3918, + "grad_norm": 0.35180357098579407, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.768452096, + "gpu_mem": 4.436687872, + "loss": 1.3772, + "grad_norm": 0.4398294687271118, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.768648704, + "gpu_mem": 4.4366848, + "loss": 1.3796, + "grad_norm": 0.6190657615661621, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.768845312, + "gpu_mem": 4.4366848, + "loss": 1.3915, + "grad_norm": 0.6004791259765625, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.76904192, + "gpu_mem": 4.436675584, + "loss": 1.3829, + "grad_norm": 0.5236581563949585, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.769238528, + "gpu_mem": 4.436710912, + "loss": 1.4013, + "grad_norm": 1.0444456338882446, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.769435136, + "gpu_mem": 4.436667904, + "loss": 1.372, + "grad_norm": 0.370542973279953, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.769631744, + "gpu_mem": 4.436695552, + "loss": 1.3998, + "grad_norm": 0.6294221878051758, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.769828352, + "gpu_mem": 4.436704768, + "loss": 1.3974, + "grad_norm": 1.1628621816635132, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.77002496, + "gpu_mem": 4.43667712, + "loss": 1.3773, + "grad_norm": 0.8685598373413086, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.770221568, + "gpu_mem": 4.436686336, + "loss": 1.3793, + "grad_norm": 0.5068528056144714, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.770418176, + "gpu_mem": 4.436687872, + "loss": 1.4177, + "grad_norm": 0.7025962471961975, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.770811392, + "gpu_mem": 4.436687872, + "loss": 1.4029, + "grad_norm": 0.6025776863098145, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.771008, + "gpu_mem": 4.436672512, + "loss": 1.4242, + "grad_norm": 0.7310124039649963, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.771204608, + "gpu_mem": 4.436694016, + "loss": 1.3883, + "grad_norm": 0.5473674535751343, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.771401216, + "gpu_mem": 4.436727808, + "loss": 1.3878, + "grad_norm": 0.5928773283958435, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.771597824, + "gpu_mem": 4.436681728, + "loss": 1.3855, + "grad_norm": 0.5716749429702759, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.771794432, + "gpu_mem": 4.436687872, + "loss": 1.3741, + "grad_norm": 0.4931805431842804, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.771794432, + "gpu_mem": 4.436703232, + "loss": 1.4117, + "grad_norm": 0.7298491597175598, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.77199104, + "gpu_mem": 4.436721664, + "loss": 1.405, + "grad_norm": 0.815715491771698, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.772187648, + "gpu_mem": 4.436690944, + "loss": 1.378, + "grad_norm": 0.318278968334198, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.772384256, + "gpu_mem": 4.43667712, + "loss": 1.3792, + "grad_norm": 0.42567333579063416, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.772580864, + "gpu_mem": 4.43666944, + "loss": 1.3794, + "grad_norm": 0.33707982301712036, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.772777472, + "gpu_mem": 4.436733952, + "loss": 1.385, + "grad_norm": 0.4199323058128357, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.77297408, + "gpu_mem": 4.436672512, + "loss": 1.3727, + "grad_norm": 0.5787172913551331, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.773170688, + "gpu_mem": 4.436724736, + "loss": 1.37, + "grad_norm": 0.2990874648094177, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.773367296, + "gpu_mem": 4.436706304, + "loss": 1.3734, + "grad_norm": 0.38380107283592224, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.773563904, + "gpu_mem": 4.436704768, + "loss": 1.3919, + "grad_norm": 0.5429220199584961, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.773760512, + "gpu_mem": 4.436709376, + "loss": 1.3803, + "grad_norm": 0.38069674372673035, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.77395712, + "gpu_mem": 4.4366848, + "loss": 1.3869, + "grad_norm": 0.969658613204956, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.774153728, + "gpu_mem": 4.436713984, + "loss": 1.3855, + "grad_norm": 0.44469931721687317, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.774350336, + "gpu_mem": 4.436690944, + "loss": 1.3796, + "grad_norm": 0.6257663369178772, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.774350336, + "gpu_mem": 4.436752384, + "loss": 1.3992, + "grad_norm": 0.9581045508384705, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.774350336, + "gpu_mem": 4.43667712, + "loss": 1.3913, + "grad_norm": 0.7786872386932373, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.774546944, + "gpu_mem": 4.436687872, + "loss": 1.3816, + "grad_norm": 0.6045745611190796, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.774743552, + "gpu_mem": 4.436686336, + "loss": 1.41, + "grad_norm": 0.47206735610961914, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.77494016, + "gpu_mem": 4.436683264, + "loss": 1.3448, + "grad_norm": 0.4412538409233093, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.775136768, + "gpu_mem": 4.436713984, + "loss": 1.3673, + "grad_norm": 0.5103728175163269, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.775333376, + "gpu_mem": 4.43669248, + "loss": 1.3559, + "grad_norm": 0.5571041107177734, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.775529984, + "gpu_mem": 4.436687872, + "loss": 1.3356, + "grad_norm": 0.6705704927444458, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.775726592, + "gpu_mem": 4.436698624, + "loss": 1.3612, + "grad_norm": 0.8544007539749146, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.7759232, + "gpu_mem": 4.436703232, + "loss": 1.3657, + "grad_norm": 0.43387213349342346, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.776119808, + "gpu_mem": 4.436664832, + "loss": 1.376, + "grad_norm": 0.4864002466201782, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.776316416, + "gpu_mem": 4.436732416, + "loss": 1.3411, + "grad_norm": 0.4893796443939209, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.776513024, + "gpu_mem": 4.436695552, + "loss": 1.2983, + "grad_norm": 0.6367840766906738, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.776709632, + "gpu_mem": 4.4366848, + "loss": 1.3653, + "grad_norm": 0.9235938787460327, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.776709632, + "gpu_mem": 4.436701696, + "loss": 1.335, + "grad_norm": 0.9581326842308044, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.776709632, + "gpu_mem": 4.436675584, + "loss": 1.3828, + "grad_norm": 1.9765608310699463, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.77690624, + "gpu_mem": 4.4367232, + "loss": 1.348, + "grad_norm": 1.8403476476669312, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.777102848, + "gpu_mem": 4.436690944, + "loss": 1.3178, + "grad_norm": 2.166224956512451, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.777299456, + "gpu_mem": 4.436680192, + "loss": 1.3017, + "grad_norm": 1.200668215751648, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.777496064, + "gpu_mem": 4.436695552, + "loss": 1.2965, + "grad_norm": 1.0055468082427979, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.777692672, + "gpu_mem": 4.43669248, + "loss": 1.2691, + "grad_norm": 0.8830154538154602, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.77788928, + "gpu_mem": 4.43669248, + "loss": 1.2906, + "grad_norm": 1.1126112937927246, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.77788928, + "gpu_mem": 4.436680192, + "loss": 1.365, + "grad_norm": 1.2338305711746216, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.778085888, + "gpu_mem": 4.436663296, + "loss": 1.2681, + "grad_norm": 1.1255525350570679, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.778085888, + "gpu_mem": 4.436726272, + "loss": 1.3051, + "grad_norm": 1.1394000053405762, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.778282496, + "gpu_mem": 4.436680192, + "loss": 1.2972, + "grad_norm": 1.1032432317733765, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.778479104, + "gpu_mem": 4.436689408, + "loss": 1.3083, + "grad_norm": 2.024336099624634, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.778675712, + "gpu_mem": 4.436724736, + "loss": 1.1981, + "grad_norm": 1.5016961097717285, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.778675712, + "gpu_mem": 4.436689408, + "loss": 1.2339, + "grad_norm": 1.135440707206726, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.77887232, + "gpu_mem": 4.436694016, + "loss": 1.3272, + "grad_norm": 2.8508365154266357, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.779068928, + "gpu_mem": 4.436741632, + "loss": 1.1513, + "grad_norm": 2.8491263389587402, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.779265536, + "gpu_mem": 4.436750848, + "loss": 1.2364, + "grad_norm": 2.807981014251709, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.779462144, + "gpu_mem": 4.436704768, + "loss": 1.4862, + "grad_norm": 7.530654430389404, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.779462144, + "gpu_mem": 4.436698624, + "loss": 1.1133, + "grad_norm": 2.956831216812134, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.779658752, + "gpu_mem": 4.436760064, + "loss": 1.1641, + "grad_norm": 2.5013203620910645, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.77985536, + "gpu_mem": 4.436686336, + "loss": 1.1011, + "grad_norm": 2.5826539993286133, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.780051968, + "gpu_mem": 4.4366848, + "loss": 1.3088, + "grad_norm": 5.981411933898926, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.780051968, + "gpu_mem": 4.436687872, + "loss": 1.1254, + "grad_norm": 2.732546329498291, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.780051968, + "gpu_mem": 4.436674048, + "loss": 1.0749, + "grad_norm": 2.460160255432129, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.780248576, + "gpu_mem": 4.436689408, + "loss": 1.1221, + "grad_norm": 2.463603973388672, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.780445184, + "gpu_mem": 4.436727808, + "loss": 1.0889, + "grad_norm": 2.291102409362793, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.780641792, + "gpu_mem": 4.43670784, + "loss": 1.2343, + "grad_norm": 5.164675235748291, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.7808384, + "gpu_mem": 4.436733952, + "loss": 1.2226, + "grad_norm": 2.820469379425049, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.781035008, + "gpu_mem": 4.4366848, + "loss": 1.0485, + "grad_norm": 2.629295825958252, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.781035008, + "gpu_mem": 4.436678656, + "loss": 1.0056, + "grad_norm": 2.2663748264312744, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.781231616, + "gpu_mem": 4.436701696, + "loss": 1.0301, + "grad_norm": 6.019822597503662, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.781231616, + "gpu_mem": 4.436680192, + "loss": 1.0241, + "grad_norm": 3.2175159454345703, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.781428224, + "gpu_mem": 4.436694016, + "loss": 1.2092, + "grad_norm": 3.445387125015259, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.781624832, + "gpu_mem": 4.436698624, + "loss": 1.0962, + "grad_norm": 4.872371673583984, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.781624832, + "gpu_mem": 4.436717056, + "loss": 1.1149, + "grad_norm": 4.223694801330566, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.78182144, + "gpu_mem": 4.436687872, + "loss": 0.8596, + "grad_norm": 2.956149101257324, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.78182144, + "gpu_mem": 4.43671552, + "loss": 1.1906, + "grad_norm": 3.2856316566467285, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.782018048, + "gpu_mem": 4.436697088, + "loss": 0.9925, + "grad_norm": 2.9810378551483154, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.782018048, + "gpu_mem": 4.4366848, + "loss": 1.1111, + "grad_norm": 4.0183820724487305, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.782018048, + "gpu_mem": 4.436694016, + "loss": 1.136, + "grad_norm": 3.4965949058532715, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.782214656, + "gpu_mem": 4.436690944, + "loss": 1.0289, + "grad_norm": 3.3413844108581543, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.782411264, + "gpu_mem": 4.436706304, + "loss": 0.9028, + "grad_norm": 3.762413740158081, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.782607872, + "gpu_mem": 4.436713984, + "loss": 0.9394, + "grad_norm": 3.1840126514434814, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.78280448, + "gpu_mem": 4.436703232, + "loss": 1.0697, + "grad_norm": 3.6577200889587402, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.78280448, + "gpu_mem": 4.436687872, + "loss": 1.16, + "grad_norm": 4.8309102058410645, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.78280448, + "gpu_mem": 4.436690944, + "loss": 0.9739, + "grad_norm": 3.4009406566619873, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.783001088, + "gpu_mem": 4.4366848, + "loss": 1.1716, + "grad_norm": 3.8871536254882812, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.783197696, + "gpu_mem": 4.436680192, + "loss": 1.1751, + "grad_norm": 5.638017177581787, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.783197696, + "gpu_mem": 4.436701696, + "loss": 0.9295, + "grad_norm": 3.9298219680786133, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.783394304, + "gpu_mem": 4.436694016, + "loss": 0.9403, + "grad_norm": 3.172013998031616, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.783394304, + "gpu_mem": 4.436666368, + "loss": 1.1591, + "grad_norm": 3.8074755668640137, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.783590912, + "gpu_mem": 4.436664832, + "loss": 0.9856, + "grad_norm": 2.971527099609375, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.783590912, + "gpu_mem": 4.436690944, + "loss": 0.959, + "grad_norm": 5.159769535064697, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.783590912, + "gpu_mem": 4.436674048, + "loss": 0.8947, + "grad_norm": 3.293670654296875, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.78378752, + "gpu_mem": 4.436704768, + "loss": 0.9283, + "grad_norm": 5.646028518676758, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.783984128, + "gpu_mem": 4.436687872, + "loss": 0.9999, + "grad_norm": 4.604079246520996, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.783984128, + "gpu_mem": 4.436718592, + "loss": 0.9074, + "grad_norm": 3.168745756149292, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.784180736, + "gpu_mem": 4.436686336, + "loss": 0.8542, + "grad_norm": 3.5956215858459473, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.784377344, + "gpu_mem": 4.436712448, + "loss": 0.8653, + "grad_norm": 3.365502119064331, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.784377344, + "gpu_mem": 4.436687872, + "loss": 1.0041, + "grad_norm": 6.088065147399902, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.784377344, + "gpu_mem": 4.436683264, + "loss": 0.9069, + "grad_norm": 5.252047538757324, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.784573952, + "gpu_mem": 4.436686336, + "loss": 0.8177, + "grad_norm": 3.816659688949585, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.784573952, + "gpu_mem": 4.436704768, + "loss": 0.7758, + "grad_norm": 4.273536205291748, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.78477056, + "gpu_mem": 4.4366848, + "loss": 0.8889, + "grad_norm": 4.014143466949463, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.784967168, + "gpu_mem": 4.436689408, + "loss": 0.9264, + "grad_norm": 4.8177690505981445, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.784967168, + "gpu_mem": 4.4366848, + "loss": 0.9192, + "grad_norm": 5.479294776916504, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.784967168, + "gpu_mem": 4.43669248, + "loss": 0.6386, + "grad_norm": 3.0257720947265625, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.785163776, + "gpu_mem": 4.436717056, + "loss": 0.8547, + "grad_norm": 7.127033710479736, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.785163776, + "gpu_mem": 4.436709376, + "loss": 0.8042, + "grad_norm": 4.178009510040283, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.785360384, + "gpu_mem": 4.436710912, + "loss": 0.8691, + "grad_norm": 4.265892028808594, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.785556992, + "gpu_mem": 4.436686336, + "loss": 0.8113, + "grad_norm": 4.079497814178467, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.785556992, + "gpu_mem": 4.436687872, + "loss": 0.9329, + "grad_norm": 4.325129508972168, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.785556992, + "gpu_mem": 4.43670784, + "loss": 0.8473, + "grad_norm": 3.7255797386169434, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.7857536, + "gpu_mem": 4.436680192, + "loss": 0.9439, + "grad_norm": 4.617800712585449, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.785950208, + "gpu_mem": 4.43669248, + "loss": 0.7199, + "grad_norm": 3.5368435382843018, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.785950208, + "gpu_mem": 4.436701696, + "loss": 0.758, + "grad_norm": 4.384279251098633, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.786146816, + "gpu_mem": 4.436678656, + "loss": 0.7699, + "grad_norm": 2.9671826362609863, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.786146816, + "gpu_mem": 4.436703232, + "loss": 0.8761, + "grad_norm": 4.175673484802246, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.786343424, + "gpu_mem": 4.436703232, + "loss": 0.6577, + "grad_norm": 4.008923530578613, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.786343424, + "gpu_mem": 4.436686336, + "loss": 0.9596, + "grad_norm": 6.723553657531738, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.786540032, + "gpu_mem": 4.436683264, + "loss": 0.6398, + "grad_norm": 4.007308006286621, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.786540032, + "gpu_mem": 4.436675584, + "loss": 0.8651, + "grad_norm": 6.924561023712158, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.786540032, + "gpu_mem": 4.436713984, + "loss": 0.9014, + "grad_norm": 6.791933059692383, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.786540032, + "gpu_mem": 4.436690944, + "loss": 1.0268, + "grad_norm": 6.000373840332031, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.786540032, + "gpu_mem": 4.436689408, + "loss": 0.825, + "grad_norm": 5.565177917480469, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.786540032, + "gpu_mem": 4.436706304, + "loss": 0.7076, + "grad_norm": 4.641871929168701, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.78673664, + "gpu_mem": 4.436690944, + "loss": 0.7438, + "grad_norm": 4.893527507781982, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.78673664, + "gpu_mem": 4.436703232, + "loss": 0.7906, + "grad_norm": 3.822333812713623, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.786933248, + "gpu_mem": 4.43671552, + "loss": 0.7084, + "grad_norm": 3.810340166091919, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.787129856, + "gpu_mem": 4.436690944, + "loss": 0.8241, + "grad_norm": 4.218953609466553, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.787129856, + "gpu_mem": 4.436735488, + "loss": 0.7898, + "grad_norm": 3.741776943206787, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.787129856, + "gpu_mem": 4.436709376, + "loss": 0.7983, + "grad_norm": 3.9828784465789795, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.787326464, + "gpu_mem": 4.436706304, + "loss": 0.7939, + "grad_norm": 4.479023456573486, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.787326464, + "gpu_mem": 4.436687872, + "loss": 0.6374, + "grad_norm": 3.341785430908203, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.787326464, + "gpu_mem": 4.436694016, + "loss": 0.6307, + "grad_norm": 3.9544854164123535, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.787326464, + "gpu_mem": 4.436663296, + "loss": 0.6976, + "grad_norm": 4.005458831787109, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.787523072, + "gpu_mem": 4.436727808, + "loss": 0.7365, + "grad_norm": 3.7133290767669678, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.787523072, + "gpu_mem": 4.436681728, + "loss": 0.7202, + "grad_norm": 4.776855945587158, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.787523072, + "gpu_mem": 4.436675584, + "loss": 0.8235, + "grad_norm": 4.900458335876465, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.78771968, + "gpu_mem": 4.43673088, + "loss": 0.5662, + "grad_norm": 5.028250694274902, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.78771968, + "gpu_mem": 4.436697088, + "loss": 0.6548, + "grad_norm": 5.610021591186523, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.787916288, + "gpu_mem": 4.4366848, + "loss": 0.5687, + "grad_norm": 3.8194875717163086, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.788112896, + "gpu_mem": 4.436689408, + "loss": 0.6753, + "grad_norm": 4.071313858032227, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.788112896, + "gpu_mem": 4.43666944, + "loss": 0.7399, + "grad_norm": 4.291172981262207, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.788112896, + "gpu_mem": 4.436694016, + "loss": 0.7128, + "grad_norm": 5.111339569091797, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.788309504, + "gpu_mem": 4.436672512, + "loss": 0.5701, + "grad_norm": 4.443622589111328, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.788506112, + "gpu_mem": 4.436689408, + "loss": 0.6423, + "grad_norm": 5.427051067352295, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.788506112, + "gpu_mem": 4.43665408, + "loss": 0.8513, + "grad_norm": 5.775101184844971, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.788506112, + "gpu_mem": 4.436686336, + "loss": 0.5987, + "grad_norm": 3.988433599472046, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.78870272, + "gpu_mem": 4.436675584, + "loss": 0.5402, + "grad_norm": 3.563646078109741, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.78870272, + "gpu_mem": 4.436712448, + "loss": 0.8261, + "grad_norm": 5.724892616271973, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.78870272, + "gpu_mem": 4.436678656, + "loss": 0.8042, + "grad_norm": 7.161776542663574, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.78870272, + "gpu_mem": 4.436701696, + "loss": 0.8449, + "grad_norm": 6.6703619956970215, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.788899328, + "gpu_mem": 4.436690944, + "loss": 0.6449, + "grad_norm": 4.802823066711426, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.788899328, + "gpu_mem": 4.436697088, + "loss": 0.6804, + "grad_norm": 4.638364315032959, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.789095936, + "gpu_mem": 4.436690944, + "loss": 0.714, + "grad_norm": 4.195858478546143, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.789095936, + "gpu_mem": 4.436709376, + "loss": 0.79, + "grad_norm": 3.7465264797210693, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.789095936, + "gpu_mem": 4.43666944, + "loss": 0.6638, + "grad_norm": 4.017826080322266, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.789095936, + "gpu_mem": 4.436701696, + "loss": 0.8603, + "grad_norm": 4.366079807281494, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.789095936, + "gpu_mem": 4.436721664, + "loss": 0.7275, + "grad_norm": 3.6458230018615723, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.789292544, + "gpu_mem": 4.43671552, + "loss": 0.5578, + "grad_norm": 4.239272594451904, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.789292544, + "gpu_mem": 4.436678656, + "loss": 0.7805, + "grad_norm": 4.6191840171813965, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.789292544, + "gpu_mem": 4.436695552, + "loss": 0.4805, + "grad_norm": 3.9109909534454346, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.789292544, + "gpu_mem": 4.436672512, + "loss": 0.8584, + "grad_norm": 4.176302909851074, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.789489152, + "gpu_mem": 4.436704768, + "loss": 0.7479, + "grad_norm": 4.6865949630737305, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.78968576, + "gpu_mem": 4.43670016, + "loss": 0.8052, + "grad_norm": 4.493798732757568, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.78968576, + "gpu_mem": 4.436709376, + "loss": 0.6735, + "grad_norm": 4.104478359222412, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.78968576, + "gpu_mem": 4.436683264, + "loss": 0.5893, + "grad_norm": 4.188327312469482, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.789882368, + "gpu_mem": 4.436703232, + "loss": 0.5265, + "grad_norm": 3.1024508476257324, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.789882368, + "gpu_mem": 4.43667712, + "loss": 0.9167, + "grad_norm": 5.433984279632568, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.789882368, + "gpu_mem": 4.436701696, + "loss": 0.6721, + "grad_norm": 5.92414665222168, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.789882368, + "gpu_mem": 4.436686336, + "loss": 0.5699, + "grad_norm": 4.380060195922852, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.789882368, + "gpu_mem": 4.436720128, + "loss": 0.7559, + "grad_norm": 4.462453365325928, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.790078976, + "gpu_mem": 4.43670016, + "loss": 0.6469, + "grad_norm": 3.473867654800415, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.790078976, + "gpu_mem": 4.4366848, + "loss": 0.6012, + "grad_norm": 3.0591113567352295, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.790078976, + "gpu_mem": 4.436720128, + "loss": 0.707, + "grad_norm": 3.776118516921997, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.790078976, + "gpu_mem": 4.436726272, + "loss": 0.5405, + "grad_norm": 3.874037981033325, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.790078976, + "gpu_mem": 4.436689408, + "loss": 0.7073, + "grad_norm": 3.9455573558807373, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.790275584, + "gpu_mem": 4.436667904, + "loss": 0.6956, + "grad_norm": 3.3347461223602295, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.790275584, + "gpu_mem": 4.436720128, + "loss": 0.7794, + "grad_norm": 4.386207580566406, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.790275584, + "gpu_mem": 4.436706304, + "loss": 0.5198, + "grad_norm": 4.139154434204102, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.790275584, + "gpu_mem": 4.43670016, + "loss": 0.5857, + "grad_norm": 3.7094950675964355, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.790472192, + "gpu_mem": 4.436706304, + "loss": 0.5499, + "grad_norm": 3.56085467338562, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.790472192, + "gpu_mem": 4.436683264, + "loss": 0.8469, + "grad_norm": 6.771099090576172, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.790472192, + "gpu_mem": 4.436697088, + "loss": 0.5178, + "grad_norm": 4.526926517486572, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.790472192, + "gpu_mem": 4.436697088, + "loss": 0.7327, + "grad_norm": 4.429098606109619, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.7906688, + "gpu_mem": 4.436666368, + "loss": 0.6306, + "grad_norm": 4.024591445922852, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.7906688, + "gpu_mem": 4.43670016, + "loss": 0.7435, + "grad_norm": 4.39280891418457, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.7906688, + "gpu_mem": 4.436678656, + "loss": 0.6424, + "grad_norm": 3.8005006313323975, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.7906688, + "gpu_mem": 4.436686336, + "loss": 0.5507, + "grad_norm": 4.362600803375244, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.7906688, + "gpu_mem": 4.436704768, + "loss": 0.6251, + "grad_norm": 4.1261162757873535, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.790865408, + "gpu_mem": 4.436672512, + "loss": 0.8, + "grad_norm": 5.890397548675537, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.790865408, + "gpu_mem": 4.43667712, + "loss": 0.5398, + "grad_norm": 5.174508571624756, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.790865408, + "gpu_mem": 4.436672512, + "loss": 0.55, + "grad_norm": 4.136332035064697, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.791062016, + "gpu_mem": 4.436717056, + "loss": 0.695, + "grad_norm": 3.9705355167388916, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.791062016, + "gpu_mem": 4.43670016, + "loss": 0.4963, + "grad_norm": 3.857638359069824, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.791258624, + "gpu_mem": 4.436689408, + "loss": 0.5862, + "grad_norm": 4.611895561218262, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.791258624, + "gpu_mem": 4.436710912, + "loss": 0.6116, + "grad_norm": 4.632784843444824, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.791258624, + "gpu_mem": 4.43667712, + "loss": 0.5249, + "grad_norm": 4.745171070098877, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.791258624, + "gpu_mem": 4.43669248, + "loss": 0.5955, + "grad_norm": 5.121898174285889, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.791258624, + "gpu_mem": 4.43669248, + "loss": 0.5652, + "grad_norm": 3.9958999156951904, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.791455232, + "gpu_mem": 4.436683264, + "loss": 0.5312, + "grad_norm": 4.14360237121582, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.791455232, + "gpu_mem": 4.436694016, + "loss": 0.6692, + "grad_norm": 3.83903169631958, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.791455232, + "gpu_mem": 4.436718592, + "loss": 0.4856, + "grad_norm": 4.325196743011475, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.791455232, + "gpu_mem": 4.436670976, + "loss": 0.8965, + "grad_norm": 5.735899925231934, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.79165184, + "gpu_mem": 4.436706304, + "loss": 0.725, + "grad_norm": 7.810929298400879, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.79165184, + "gpu_mem": 4.436667904, + "loss": 0.5693, + "grad_norm": 6.714444160461426, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.79165184, + "gpu_mem": 4.436686336, + "loss": 0.6229, + "grad_norm": 4.6374359130859375, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.79165184, + "gpu_mem": 4.436678656, + "loss": 0.6744, + "grad_norm": 4.346306324005127, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.79165184, + "gpu_mem": 4.43671552, + "loss": 0.6249, + "grad_norm": 5.048990249633789, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.79165184, + "gpu_mem": 4.436675584, + "loss": 0.5342, + "grad_norm": 4.457531452178955, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.79165184, + "gpu_mem": 4.436689408, + "loss": 0.4752, + "grad_norm": 3.675194263458252, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.791848448, + "gpu_mem": 4.436694016, + "loss": 0.4725, + "grad_norm": 5.338257789611816, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.792045056, + "gpu_mem": 4.436655616, + "loss": 0.5669, + "grad_norm": 3.651369571685791, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.792045056, + "gpu_mem": 4.436678656, + "loss": 0.7784, + "grad_norm": 4.949639320373535, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.792045056, + "gpu_mem": 4.43667712, + "loss": 0.5534, + "grad_norm": 4.349726676940918, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.792045056, + "gpu_mem": 4.436695552, + "loss": 0.6157, + "grad_norm": 6.235125541687012, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.792241664, + "gpu_mem": 4.43669248, + "loss": 0.6666, + "grad_norm": 5.24396276473999, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.792241664, + "gpu_mem": 4.436690944, + "loss": 0.7291, + "grad_norm": 5.097180366516113, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.792241664, + "gpu_mem": 4.436709376, + "loss": 0.4468, + "grad_norm": 3.487100601196289, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.792241664, + "gpu_mem": 4.436670976, + "loss": 0.6028, + "grad_norm": 3.9118049144744873, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.792438272, + "gpu_mem": 4.43671552, + "loss": 0.5544, + "grad_norm": 3.869927406311035, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.792438272, + "gpu_mem": 4.436680192, + "loss": 0.4751, + "grad_norm": 3.780824661254883, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.792438272, + "gpu_mem": 4.43670784, + "loss": 0.6496, + "grad_norm": 3.8507421016693115, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.792438272, + "gpu_mem": 4.436687872, + "loss": 0.6267, + "grad_norm": 4.466668605804443, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.792438272, + "gpu_mem": 4.436733952, + "loss": 0.5727, + "grad_norm": 3.4449939727783203, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.792438272, + "gpu_mem": 4.436698624, + "loss": 0.4988, + "grad_norm": 3.192558526992798, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.792438272, + "gpu_mem": 4.436689408, + "loss": 0.4523, + "grad_norm": 3.800586700439453, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.792438272, + "gpu_mem": 4.436683264, + "loss": 0.4756, + "grad_norm": 5.1058669090271, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.792438272, + "gpu_mem": 4.436667904, + "loss": 0.5632, + "grad_norm": 4.608668804168701, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.79263488, + "gpu_mem": 4.436686336, + "loss": 0.8295, + "grad_norm": 7.807580471038818, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.79263488, + "gpu_mem": 4.436687872, + "loss": 0.4478, + "grad_norm": 3.91226863861084, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.79263488, + "gpu_mem": 4.43669248, + "loss": 0.7212, + "grad_norm": 7.0274810791015625, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.79263488, + "gpu_mem": 4.436695552, + "loss": 0.3686, + "grad_norm": 4.005268573760986, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.79263488, + "gpu_mem": 4.436689408, + "loss": 0.7689, + "grad_norm": 5.850273132324219, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.792831488, + "gpu_mem": 4.43671552, + "loss": 0.7196, + "grad_norm": 5.689333438873291, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.792831488, + "gpu_mem": 4.436683264, + "loss": 0.4752, + "grad_norm": 3.297355890274048, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.792831488, + "gpu_mem": 4.436710912, + "loss": 0.4048, + "grad_norm": 3.830850839614868, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.792831488, + "gpu_mem": 4.436718592, + "loss": 0.5723, + "grad_norm": 6.2726898193359375, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.792831488, + "gpu_mem": 4.43670016, + "loss": 0.6736, + "grad_norm": 4.726658344268799, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.792831488, + "gpu_mem": 4.436686336, + "loss": 0.5046, + "grad_norm": 3.8653242588043213, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.792831488, + "gpu_mem": 4.436697088, + "loss": 0.5143, + "grad_norm": 3.675222873687744, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.793028096, + "gpu_mem": 4.436689408, + "loss": 0.458, + "grad_norm": 3.310844659805298, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.793028096, + "gpu_mem": 4.436706304, + "loss": 0.6416, + "grad_norm": 4.6833977699279785, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.793028096, + "gpu_mem": 4.436678656, + "loss": 0.4042, + "grad_norm": 3.632859706878662, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.793028096, + "gpu_mem": 4.436709376, + "loss": 0.5621, + "grad_norm": 4.671077251434326, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.793028096, + "gpu_mem": 4.436690944, + "loss": 0.7289, + "grad_norm": 5.497105598449707, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.793028096, + "gpu_mem": 4.436678656, + "loss": 0.718, + "grad_norm": 3.973571538925171, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.793028096, + "gpu_mem": 4.436690944, + "loss": 0.6272, + "grad_norm": 4.856440544128418, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.793224704, + "gpu_mem": 4.436697088, + "loss": 0.4578, + "grad_norm": 3.6832237243652344, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.793224704, + "gpu_mem": 4.4366848, + "loss": 0.3874, + "grad_norm": 3.653801202774048, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.793224704, + "gpu_mem": 4.436674048, + "loss": 0.4127, + "grad_norm": 3.9497387409210205, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.793224704, + "gpu_mem": 4.436675584, + "loss": 0.5924, + "grad_norm": 3.8645057678222656, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436689408, + "loss": 0.5187, + "grad_norm": 2.7863729000091553, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.793421312, + "gpu_mem": 4.43669248, + "loss": 0.4878, + "grad_norm": 3.640704393386841, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436703232, + "loss": 0.4578, + "grad_norm": 3.654576301574707, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.793421312, + "gpu_mem": 4.43667712, + "loss": 0.5235, + "grad_norm": 3.8503942489624023, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.793421312, + "gpu_mem": 4.43669248, + "loss": 0.6348, + "grad_norm": 4.960061550140381, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436701696, + "loss": 0.6316, + "grad_norm": 5.092273712158203, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436675584, + "loss": 0.5847, + "grad_norm": 5.045073986053467, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436681728, + "loss": 0.571, + "grad_norm": 4.303534984588623, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436670976, + "loss": 0.6083, + "grad_norm": 5.757172584533691, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.793421312, + "gpu_mem": 4.43667712, + "loss": 0.5189, + "grad_norm": 4.174593925476074, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436713984, + "loss": 0.3872, + "grad_norm": 4.346033573150635, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.793421312, + "gpu_mem": 4.43666176, + "loss": 0.6286, + "grad_norm": 4.697591304779053, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.793421312, + "gpu_mem": 4.436681728, + "loss": 0.5161, + "grad_norm": 4.314554214477539, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436681728, + "loss": 0.5617, + "grad_norm": 5.453423023223877, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436680192, + "loss": 0.4787, + "grad_norm": 3.520378828048706, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436678656, + "loss": 0.4006, + "grad_norm": 4.303217887878418, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436670976, + "loss": 0.6042, + "grad_norm": 4.1407999992370605, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.79361792, + "gpu_mem": 4.43673088, + "loss": 0.4605, + "grad_norm": 4.869826316833496, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436675584, + "loss": 0.4132, + "grad_norm": 3.77632999420166, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436658688, + "loss": 0.5935, + "grad_norm": 3.551877021789551, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436689408, + "loss": 0.6255, + "grad_norm": 4.1481781005859375, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.79361792, + "gpu_mem": 4.436733952, + "loss": 0.7372, + "grad_norm": 5.095218181610107, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436713984, + "loss": 0.3861, + "grad_norm": 4.123284816741943, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436713984, + "loss": 0.4212, + "grad_norm": 3.582926034927368, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436680192, + "loss": 0.5269, + "grad_norm": 5.184456825256348, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436704768, + "loss": 0.5519, + "grad_norm": 4.651491641998291, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.793814528, + "gpu_mem": 4.43670784, + "loss": 0.656, + "grad_norm": 4.362300872802734, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436686336, + "loss": 0.5022, + "grad_norm": 4.698019027709961, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436709376, + "loss": 0.5076, + "grad_norm": 6.532934188842773, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436689408, + "loss": 0.6816, + "grad_norm": 4.832774639129639, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.793814528, + "gpu_mem": 4.436713984, + "loss": 0.3972, + "grad_norm": 2.9182722568511963, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436697088, + "loss": 0.5205, + "grad_norm": 4.4874114990234375, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.794011136, + "gpu_mem": 4.43669248, + "loss": 0.464, + "grad_norm": 3.8461802005767822, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436704768, + "loss": 0.4233, + "grad_norm": 4.035707473754883, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436675584, + "loss": 0.5125, + "grad_norm": 4.001553058624268, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436689408, + "loss": 0.6883, + "grad_norm": 4.442062854766846, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436675584, + "loss": 0.5381, + "grad_norm": 4.1814374923706055, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.794011136, + "gpu_mem": 4.43666944, + "loss": 0.534, + "grad_norm": 4.3359174728393555, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436675584, + "loss": 0.4685, + "grad_norm": 3.375187397003174, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436689408, + "loss": 0.5675, + "grad_norm": 3.5298495292663574, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436672512, + "loss": 0.6223, + "grad_norm": 3.939624309539795, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436726272, + "loss": 0.4788, + "grad_norm": 3.603093385696411, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.794011136, + "gpu_mem": 4.43666944, + "loss": 0.5081, + "grad_norm": 4.091839790344238, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436747776, + "loss": 0.478, + "grad_norm": 4.184883117675781, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436690944, + "loss": 0.4467, + "grad_norm": 4.470358371734619, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.794011136, + "gpu_mem": 4.436709376, + "loss": 0.597, + "grad_norm": 5.192322731018066, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.794011136, + "gpu_mem": 4.4366848, + "loss": 0.4513, + "grad_norm": 3.295670747756958, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.794207744, + "gpu_mem": 4.436717056, + "loss": 0.4902, + "grad_norm": 4.266352653503418, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436737024, + "loss": 0.6717, + "grad_norm": 4.769539833068848, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436666368, + "loss": 0.5122, + "grad_norm": 4.379767894744873, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436680192, + "loss": 0.3744, + "grad_norm": 4.1639885902404785, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436664832, + "loss": 0.6843, + "grad_norm": 4.648193359375, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436703232, + "loss": 0.4328, + "grad_norm": 3.263113021850586, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436703232, + "loss": 0.5828, + "grad_norm": 5.040179252624512, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436689408, + "loss": 0.4996, + "grad_norm": 3.661585807800293, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436680192, + "loss": 0.5299, + "grad_norm": 4.205281734466553, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.794404352, + "gpu_mem": 4.4366848, + "loss": 0.5461, + "grad_norm": 3.244652271270752, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436687872, + "loss": 0.4999, + "grad_norm": 3.5447440147399902, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436694016, + "loss": 0.4029, + "grad_norm": 2.8145949840545654, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436712448, + "loss": 0.449, + "grad_norm": 3.32452392578125, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436706304, + "loss": 0.6632, + "grad_norm": 5.892474174499512, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436683264, + "loss": 0.3563, + "grad_norm": 3.4862375259399414, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436670976, + "loss": 0.5627, + "grad_norm": 5.056401252746582, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436637184, + "loss": 0.6454, + "grad_norm": 5.47981595993042, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.794404352, + "gpu_mem": 4.4366848, + "loss": 0.563, + "grad_norm": 3.9013442993164062, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436651008, + "loss": 0.6183, + "grad_norm": 4.71444034576416, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436698624, + "loss": 0.6445, + "grad_norm": 4.649319171905518, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436697088, + "loss": 0.6351, + "grad_norm": 3.7950797080993652, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.794404352, + "gpu_mem": 4.436698624, + "loss": 0.4075, + "grad_norm": 3.232701063156128, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.794404352, + "gpu_mem": 4.43670784, + "loss": 0.7319, + "grad_norm": 4.970794200897217, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436683264, + "loss": 0.6494, + "grad_norm": 4.306753158569336, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436667904, + "loss": 0.658, + "grad_norm": 4.080709457397461, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436697088, + "loss": 0.614, + "grad_norm": 4.221437454223633, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436710912, + "loss": 0.5669, + "grad_norm": 4.341765403747559, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436666368, + "loss": 0.5183, + "grad_norm": 3.6584784984588623, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436672512, + "loss": 0.6065, + "grad_norm": 3.6766653060913086, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436701696, + "loss": 0.6057, + "grad_norm": 4.101161003112793, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436697088, + "loss": 0.4681, + "grad_norm": 3.3177623748779297, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436683264, + "loss": 0.6727, + "grad_norm": 4.047039031982422, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436697088, + "loss": 0.4736, + "grad_norm": 4.27162504196167, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436686336, + "loss": 0.607, + "grad_norm": 3.6118993759155273, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.79460096, + "gpu_mem": 4.43669248, + "loss": 0.4518, + "grad_norm": 3.250335454940796, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436697088, + "loss": 0.5898, + "grad_norm": 4.267196178436279, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.79460096, + "gpu_mem": 4.43669248, + "loss": 0.4738, + "grad_norm": 3.71323823928833, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436666368, + "loss": 0.561, + "grad_norm": 3.839517116546631, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436675584, + "loss": 0.4335, + "grad_norm": 4.0469207763671875, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436694016, + "loss": 0.6196, + "grad_norm": 3.954371929168701, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436664832, + "loss": 0.5449, + "grad_norm": 4.013249397277832, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436695552, + "loss": 0.6192, + "grad_norm": 4.606841564178467, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.79460096, + "gpu_mem": 4.436704768, + "loss": 0.462, + "grad_norm": 3.1521763801574707, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.794797568, + "gpu_mem": 4.436666368, + "loss": 0.7037, + "grad_norm": 4.533555507659912, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.794797568, + "gpu_mem": 4.436670976, + "loss": 0.5093, + "grad_norm": 3.730055809020996, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.794797568, + "gpu_mem": 4.436695552, + "loss": 0.5442, + "grad_norm": 3.3977084159851074, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436713984, + "loss": 0.4168, + "grad_norm": 3.2771377563476562, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436695552, + "loss": 0.5143, + "grad_norm": 3.7577221393585205, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43674624, + "loss": 0.7301, + "grad_norm": 5.176353931427002, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436678656, + "loss": 0.8141, + "grad_norm": 4.961831569671631, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436680192, + "loss": 0.5075, + "grad_norm": 4.478232383728027, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436680192, + "loss": 0.6278, + "grad_norm": 4.468003273010254, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436686336, + "loss": 0.4849, + "grad_norm": 2.924138069152832, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43670016, + "loss": 0.3418, + "grad_norm": 3.2150347232818604, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436704768, + "loss": 0.6209, + "grad_norm": 4.409070014953613, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436698624, + "loss": 0.4491, + "grad_norm": 3.3870038986206055, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43669248, + "loss": 0.5174, + "grad_norm": 5.474210262298584, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436706304, + "loss": 0.6406, + "grad_norm": 4.501908302307129, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436698624, + "loss": 0.6543, + "grad_norm": 3.7554819583892822, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436683264, + "loss": 0.5553, + "grad_norm": 4.473869323730469, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43669248, + "loss": 0.6701, + "grad_norm": 3.913724184036255, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436701696, + "loss": 0.4617, + "grad_norm": 4.013757228851318, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436704768, + "loss": 0.5188, + "grad_norm": 4.270125865936279, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436658688, + "loss": 0.4917, + "grad_norm": 3.897599935531616, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436712448, + "loss": 0.3937, + "grad_norm": 3.885298013687134, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436713984, + "loss": 0.4639, + "grad_norm": 3.179224729537964, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436658688, + "loss": 0.4893, + "grad_norm": 3.8022515773773193, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43669248, + "loss": 0.5868, + "grad_norm": 3.588253974914551, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436670976, + "loss": 0.6219, + "grad_norm": 4.463711738586426, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436701696, + "loss": 0.595, + "grad_norm": 4.2354736328125, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43667712, + "loss": 0.4579, + "grad_norm": 4.262109756469727, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436710912, + "loss": 0.4933, + "grad_norm": 3.533358097076416, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43673088, + "loss": 0.5719, + "grad_norm": 3.9157609939575195, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436695552, + "loss": 0.426, + "grad_norm": 3.990307331085205, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43671552, + "loss": 0.4404, + "grad_norm": 3.072902202606201, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436695552, + "loss": 0.4296, + "grad_norm": 3.5783474445343018, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436697088, + "loss": 0.538, + "grad_norm": 4.739755153656006, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436689408, + "loss": 0.4491, + "grad_norm": 3.860515832901001, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436687872, + "loss": 0.6325, + "grad_norm": 5.408517837524414, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436701696, + "loss": 0.6281, + "grad_norm": 4.000801086425781, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436672512, + "loss": 0.5075, + "grad_norm": 3.1697006225585938, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.794994176, + "gpu_mem": 4.4367232, + "loss": 0.5129, + "grad_norm": 4.392864227294922, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436720128, + "loss": 0.5952, + "grad_norm": 4.163697719573975, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43670016, + "loss": 0.6223, + "grad_norm": 4.263038158416748, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436681728, + "loss": 0.445, + "grad_norm": 2.9297125339508057, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436690944, + "loss": 0.5302, + "grad_norm": 4.073398590087891, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436658688, + "loss": 0.5758, + "grad_norm": 4.972208499908447, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436720128, + "loss": 0.5329, + "grad_norm": 3.644810199737549, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436718592, + "loss": 0.4392, + "grad_norm": 3.2555408477783203, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436674048, + "loss": 0.5636, + "grad_norm": 3.7881691455841064, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436706304, + "loss": 0.5172, + "grad_norm": 4.100376129150391, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43670016, + "loss": 0.5233, + "grad_norm": 4.834383487701416, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436686336, + "loss": 0.3842, + "grad_norm": 4.330268859863281, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436686336, + "loss": 0.7489, + "grad_norm": 5.054118633270264, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436712448, + "loss": 0.417, + "grad_norm": 2.857571840286255, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43670016, + "loss": 0.3602, + "grad_norm": 4.049975872039795, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436690944, + "loss": 0.7015, + "grad_norm": 4.124582767486572, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436683264, + "loss": 0.6957, + "grad_norm": 6.526576519012451, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436706304, + "loss": 0.4289, + "grad_norm": 3.6352458000183105, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436695552, + "loss": 0.3529, + "grad_norm": 2.8481993675231934, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436680192, + "loss": 0.4049, + "grad_norm": 3.3151462078094482, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43674624, + "loss": 0.5517, + "grad_norm": 4.056897163391113, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436689408, + "loss": 0.5294, + "grad_norm": 4.433591365814209, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436675584, + "loss": 0.5069, + "grad_norm": 3.355001926422119, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436740096, + "loss": 0.5062, + "grad_norm": 4.395427703857422, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43666944, + "loss": 0.5694, + "grad_norm": 4.33888578414917, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436697088, + "loss": 0.3402, + "grad_norm": 3.34303617477417, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.794994176, + "gpu_mem": 4.43670016, + "loss": 0.5166, + "grad_norm": 3.6346209049224854, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436675584, + "loss": 0.6671, + "grad_norm": 5.3376545906066895, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436703232, + "loss": 0.476, + "grad_norm": 3.2905967235565186, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436712448, + "loss": 0.4604, + "grad_norm": 4.822052478790283, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436709376, + "loss": 0.5098, + "grad_norm": 3.513444185256958, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436681728, + "loss": 0.5438, + "grad_norm": 4.024201393127441, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436428288, + "loss": 0.3917, + "grad_norm": 4.229819297790527, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.794994176, + "gpu_mem": 4.436428288, + "train_runtime": 8245.3651, + "train_samples_per_second": 4.84, + "train_steps_per_second": 0.076, + "total_flos": 8.436434983002931e+16, + "train_loss": 1.0135697507992005 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..03eabaea80bc9f8c1936ead28264f565a8ac69c0 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4b850954df5fba1eca0a83a766e791616fe21883 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.8604859589723163 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..cd8653a48cede8be5f4271995d1f7a9908859baa --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-hellaswag-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2", + "seed": 42, + "timestamp": "2025-08-30T08:45:54.111624" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..616f8cf7d61a2fe9f8527ab2f020b60184b29c89 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r32-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.721778176, + "gpu_mem": 4.51868416, + "loss": 3.4877, + "grad_norm": 45.624332427978516, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.728069632, + "gpu_mem": 4.720528384, + "loss": 3.6203, + "grad_norm": 44.852569580078125, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.72924928, + "gpu_mem": 4.720536064, + "loss": 3.224, + "grad_norm": 42.18233108520508, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.73023232, + "gpu_mem": 4.720569856, + "loss": 3.0529, + "grad_norm": 34.29615783691406, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.73121536, + "gpu_mem": 4.720532992, + "loss": 2.5087, + "grad_norm": 25.34889030456543, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.7321984, + "gpu_mem": 4.720579072, + "loss": 2.1275, + "grad_norm": 17.694313049316406, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.732984832, + "gpu_mem": 4.720539136, + "loss": 1.8401, + "grad_norm": 12.947490692138672, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.733771264, + "gpu_mem": 4.720569856, + "loss": 1.5264, + "grad_norm": 5.436551094055176, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.734557696, + "gpu_mem": 4.720569856, + "loss": 1.4607, + "grad_norm": 2.95275616645813, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.735344128, + "gpu_mem": 4.720513024, + "loss": 1.4274, + "grad_norm": 2.161958694458008, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.73613056, + "gpu_mem": 4.720532992, + "loss": 1.4371, + "grad_norm": 5.046875, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.736916992, + "gpu_mem": 4.72052992, + "loss": 1.4013, + "grad_norm": 3.2430617809295654, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.737703424, + "gpu_mem": 4.72052224, + "loss": 1.3925, + "grad_norm": 2.895480155944824, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.738489856, + "gpu_mem": 4.720548352, + "loss": 1.3756, + "grad_norm": 2.6612460613250732, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.73907968, + "gpu_mem": 4.720546816, + "loss": 1.3866, + "grad_norm": 2.5455257892608643, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.739866112, + "gpu_mem": 4.720539136, + "loss": 1.4648, + "grad_norm": 5.530676364898682, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.740652544, + "gpu_mem": 4.720539136, + "loss": 1.4892, + "grad_norm": 4.753200054168701, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.741242368, + "gpu_mem": 4.720539136, + "loss": 1.3135, + "grad_norm": 1.888420820236206, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.7420288, + "gpu_mem": 4.720539136, + "loss": 1.4895, + "grad_norm": 3.6025712490081787, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.742618624, + "gpu_mem": 4.720513024, + "loss": 1.5106, + "grad_norm": 4.70390510559082, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.743208448, + "gpu_mem": 4.72052992, + "loss": 1.4453, + "grad_norm": 3.293036937713623, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.743798272, + "gpu_mem": 4.7205376, + "loss": 1.4775, + "grad_norm": 4.905246257781982, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.744584704, + "gpu_mem": 4.720551424, + "loss": 1.3793, + "grad_norm": 2.105634927749634, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.745174528, + "gpu_mem": 4.720536064, + "loss": 1.4438, + "grad_norm": 4.069201946258545, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.745764352, + "gpu_mem": 4.720523776, + "loss": 1.487, + "grad_norm": 3.3817083835601807, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.746354176, + "gpu_mem": 4.72052992, + "loss": 1.4191, + "grad_norm": 2.5586702823638916, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.747140608, + "gpu_mem": 4.7205376, + "loss": 1.373, + "grad_norm": 1.1180799007415771, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.747730432, + "gpu_mem": 4.720532992, + "loss": 1.3668, + "grad_norm": 0.7312409281730652, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.748123648, + "gpu_mem": 4.720542208, + "loss": 1.4861, + "grad_norm": 2.5371813774108887, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.748713472, + "gpu_mem": 4.72051456, + "loss": 1.4403, + "grad_norm": 2.0787477493286133, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.749303296, + "gpu_mem": 4.720569856, + "loss": 1.4129, + "grad_norm": 1.1780266761779785, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.750089728, + "gpu_mem": 4.720562176, + "loss": 1.3917, + "grad_norm": 0.8133901953697205, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.750482944, + "gpu_mem": 4.720516096, + "loss": 1.4106, + "grad_norm": 1.1489310264587402, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.75087616, + "gpu_mem": 4.720534528, + "loss": 1.4062, + "grad_norm": 0.9190765619277954, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.751465984, + "gpu_mem": 4.720556032, + "loss": 1.4416, + "grad_norm": 2.3195981979370117, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.752055808, + "gpu_mem": 4.720554496, + "loss": 1.4016, + "grad_norm": 0.8058652281761169, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.752645632, + "gpu_mem": 4.720586752, + "loss": 1.4117, + "grad_norm": 1.2887476682662964, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.753235456, + "gpu_mem": 4.720539136, + "loss": 1.4319, + "grad_norm": 1.2711442708969116, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.75382528, + "gpu_mem": 4.720595968, + "loss": 1.3764, + "grad_norm": 1.7507933378219604, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.754415104, + "gpu_mem": 4.720523776, + "loss": 1.4268, + "grad_norm": 1.466294288635254, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.755004928, + "gpu_mem": 4.720551424, + "loss": 1.3593, + "grad_norm": 0.8242108821868896, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.755398144, + "gpu_mem": 4.720565248, + "loss": 1.466, + "grad_norm": 2.1575498580932617, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.755987968, + "gpu_mem": 4.720571392, + "loss": 1.3959, + "grad_norm": 1.0167787075042725, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.756577792, + "gpu_mem": 4.720549888, + "loss": 1.4274, + "grad_norm": 0.9973177313804626, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.756971008, + "gpu_mem": 4.720549888, + "loss": 1.3925, + "grad_norm": 0.4502605199813843, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.757560832, + "gpu_mem": 4.720549888, + "loss": 1.3823, + "grad_norm": 1.3020449876785278, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.758150656, + "gpu_mem": 4.720536064, + "loss": 1.4321, + "grad_norm": 1.2800509929656982, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.75874048, + "gpu_mem": 4.720554496, + "loss": 1.3856, + "grad_norm": 1.0432732105255127, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.759330304, + "gpu_mem": 4.720566784, + "loss": 1.4313, + "grad_norm": 1.346016526222229, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.75972352, + "gpu_mem": 4.720543744, + "loss": 1.4321, + "grad_norm": 1.97268807888031, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.760116736, + "gpu_mem": 4.720528384, + "loss": 1.3984, + "grad_norm": 1.6832994222640991, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.76070656, + "gpu_mem": 4.720532992, + "loss": 1.3861, + "grad_norm": 1.14437735080719, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.761099776, + "gpu_mem": 4.72056064, + "loss": 1.4263, + "grad_norm": 2.0639162063598633, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.761492992, + "gpu_mem": 4.720536064, + "loss": 1.4339, + "grad_norm": 2.9532437324523926, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.762082816, + "gpu_mem": 4.720554496, + "loss": 1.4112, + "grad_norm": 1.9110426902770996, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.76267264, + "gpu_mem": 4.720548352, + "loss": 1.3884, + "grad_norm": 1.3936654329299927, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.763065856, + "gpu_mem": 4.72051456, + "loss": 1.42, + "grad_norm": 2.418982982635498, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.76365568, + "gpu_mem": 4.720543744, + "loss": 1.4723, + "grad_norm": 2.420297145843506, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.764048896, + "gpu_mem": 4.720526848, + "loss": 1.3391, + "grad_norm": 1.6782341003417969, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.764442112, + "gpu_mem": 4.72056832, + "loss": 1.4047, + "grad_norm": 1.4877963066101074, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.765031936, + "gpu_mem": 4.720534528, + "loss": 1.4046, + "grad_norm": 0.5583492517471313, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.765425152, + "gpu_mem": 4.720574464, + "loss": 1.3422, + "grad_norm": 1.1066739559173584, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.765818368, + "gpu_mem": 4.720528384, + "loss": 1.4269, + "grad_norm": 1.0821268558502197, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.766211584, + "gpu_mem": 4.720532992, + "loss": 1.4408, + "grad_norm": 1.262398600578308, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.766801408, + "gpu_mem": 4.72052992, + "loss": 1.399, + "grad_norm": 0.7498250007629395, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.767391232, + "gpu_mem": 4.720548352, + "loss": 1.4052, + "grad_norm": 0.7992784976959229, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.767784448, + "gpu_mem": 4.720540672, + "loss": 1.4222, + "grad_norm": 1.2474349737167358, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.768374272, + "gpu_mem": 4.720525312, + "loss": 1.4202, + "grad_norm": 1.8791433572769165, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.768767488, + "gpu_mem": 4.720595968, + "loss": 1.5128, + "grad_norm": 3.4562149047851562, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.769160704, + "gpu_mem": 4.720546816, + "loss": 1.4825, + "grad_norm": 2.3116061687469482, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.76955392, + "gpu_mem": 4.720571392, + "loss": 1.367, + "grad_norm": 1.3178279399871826, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.769947136, + "gpu_mem": 4.720542208, + "loss": 1.4007, + "grad_norm": 0.880597710609436, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.77053696, + "gpu_mem": 4.720534528, + "loss": 1.5591, + "grad_norm": 5.063915252685547, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.770930176, + "gpu_mem": 4.720528384, + "loss": 1.3913, + "grad_norm": 2.110844850540161, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.771323392, + "gpu_mem": 4.720557568, + "loss": 1.4695, + "grad_norm": 2.442915439605713, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.771716608, + "gpu_mem": 4.720548352, + "loss": 1.5327, + "grad_norm": 3.8041326999664307, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.772109824, + "gpu_mem": 4.720536064, + "loss": 1.4018, + "grad_norm": 1.2460558414459229, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.77250304, + "gpu_mem": 4.720528384, + "loss": 1.4786, + "grad_norm": 2.4543752670288086, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.772896256, + "gpu_mem": 4.720580608, + "loss": 1.4329, + "grad_norm": 1.9899290800094604, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.773289472, + "gpu_mem": 4.720559104, + "loss": 1.4296, + "grad_norm": 1.1790740489959717, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.773682688, + "gpu_mem": 4.72055296, + "loss": 1.3837, + "grad_norm": 1.3790696859359741, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.774272512, + "gpu_mem": 4.72052992, + "loss": 1.3632, + "grad_norm": 0.6337279081344604, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.774665728, + "gpu_mem": 4.720551424, + "loss": 1.4741, + "grad_norm": 1.534408688545227, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.775058944, + "gpu_mem": 4.720523776, + "loss": 1.4241, + "grad_norm": 1.1739754676818848, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.77545216, + "gpu_mem": 4.720531456, + "loss": 1.3907, + "grad_norm": 0.6938722729682922, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.775845376, + "gpu_mem": 4.720549888, + "loss": 1.3889, + "grad_norm": 0.7773188948631287, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.776238592, + "gpu_mem": 4.720539136, + "loss": 1.3874, + "grad_norm": 0.6501248478889465, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.776631808, + "gpu_mem": 4.7205376, + "loss": 1.3825, + "grad_norm": 1.009731411933899, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.777025024, + "gpu_mem": 4.720532992, + "loss": 1.4284, + "grad_norm": 1.0669746398925781, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.77741824, + "gpu_mem": 4.7205376, + "loss": 1.4022, + "grad_norm": 1.0928078889846802, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.777811456, + "gpu_mem": 4.720548352, + "loss": 1.4175, + "grad_norm": 1.2170552015304565, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.778204672, + "gpu_mem": 4.720551424, + "loss": 1.4046, + "grad_norm": 0.8167317509651184, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.778597888, + "gpu_mem": 4.720551424, + "loss": 1.4387, + "grad_norm": 1.2170416116714478, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.778991104, + "gpu_mem": 4.720546816, + "loss": 1.3637, + "grad_norm": 0.8987075686454773, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.77938432, + "gpu_mem": 4.720565248, + "loss": 1.4373, + "grad_norm": 2.0082123279571533, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.779580928, + "gpu_mem": 4.72056832, + "loss": 1.4173, + "grad_norm": 1.2789134979248047, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.780170752, + "gpu_mem": 4.72054528, + "loss": 1.415, + "grad_norm": 1.18489670753479, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.780563968, + "gpu_mem": 4.720556032, + "loss": 1.3812, + "grad_norm": 0.3706219792366028, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.780760576, + "gpu_mem": 4.720556032, + "loss": 1.4726, + "grad_norm": 2.415025234222412, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.781153792, + "gpu_mem": 4.720531456, + "loss": 1.4121, + "grad_norm": 1.3799420595169067, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.781547008, + "gpu_mem": 4.72056064, + "loss": 1.3845, + "grad_norm": 1.146619439125061, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.781940224, + "gpu_mem": 4.7205376, + "loss": 1.3454, + "grad_norm": 0.639557421207428, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.78233344, + "gpu_mem": 4.720554496, + "loss": 1.4796, + "grad_norm": 1.9625685214996338, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.782530048, + "gpu_mem": 4.72052224, + "loss": 1.4515, + "grad_norm": 1.855381965637207, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.782726656, + "gpu_mem": 4.7205376, + "loss": 1.3623, + "grad_norm": 0.8174445629119873, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.783119872, + "gpu_mem": 4.720517632, + "loss": 1.4901, + "grad_norm": 1.934415578842163, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.783513088, + "gpu_mem": 4.720559104, + "loss": 1.3893, + "grad_norm": 0.7557987570762634, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.783906304, + "gpu_mem": 4.720554496, + "loss": 1.3947, + "grad_norm": 0.8405246138572693, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.78429952, + "gpu_mem": 4.72056064, + "loss": 1.3714, + "grad_norm": 0.8637200593948364, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.784496128, + "gpu_mem": 4.720557568, + "loss": 1.3675, + "grad_norm": 0.5141253471374512, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.784692736, + "gpu_mem": 4.720559104, + "loss": 1.4075, + "grad_norm": 1.3532006740570068, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.785085952, + "gpu_mem": 4.720556032, + "loss": 1.343, + "grad_norm": 0.6668848395347595, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.785479168, + "gpu_mem": 4.720536064, + "loss": 1.336, + "grad_norm": 1.0137008428573608, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.785872384, + "gpu_mem": 4.720531456, + "loss": 1.2771, + "grad_norm": 1.4304274320602417, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.7862656, + "gpu_mem": 4.720549888, + "loss": 1.2722, + "grad_norm": 3.1247785091400146, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.786658816, + "gpu_mem": 4.72056064, + "loss": 1.2778, + "grad_norm": 7.110416889190674, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.787052032, + "gpu_mem": 4.720546816, + "loss": 1.3675, + "grad_norm": 4.971436977386475, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.78724864, + "gpu_mem": 4.720562176, + "loss": 1.1445, + "grad_norm": 2.869131326675415, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.787641856, + "gpu_mem": 4.720543744, + "loss": 1.3462, + "grad_norm": 4.617319583892822, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.788035072, + "gpu_mem": 4.720569856, + "loss": 1.3233, + "grad_norm": 4.377890110015869, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.78823168, + "gpu_mem": 4.720528384, + "loss": 1.2483, + "grad_norm": 2.612948417663574, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.788428288, + "gpu_mem": 4.72056064, + "loss": 1.267, + "grad_norm": 2.3601908683776855, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.788821504, + "gpu_mem": 4.720554496, + "loss": 1.1894, + "grad_norm": 1.8673102855682373, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.78921472, + "gpu_mem": 4.720556032, + "loss": 1.3193, + "grad_norm": 6.551246643066406, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.789607936, + "gpu_mem": 4.720531456, + "loss": 1.1673, + "grad_norm": 3.5235562324523926, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.789804544, + "gpu_mem": 4.720540672, + "loss": 1.0878, + "grad_norm": 7.8766093254089355, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.79019776, + "gpu_mem": 4.720526848, + "loss": 1.2909, + "grad_norm": 5.3333258628845215, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.790394368, + "gpu_mem": 4.720563712, + "loss": 1.0701, + "grad_norm": 3.926029682159424, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.790787584, + "gpu_mem": 4.72056064, + "loss": 1.0655, + "grad_norm": 3.8800652027130127, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.7911808, + "gpu_mem": 4.72056064, + "loss": 1.0747, + "grad_norm": 3.465404510498047, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.791574016, + "gpu_mem": 4.720549888, + "loss": 1.0601, + "grad_norm": 3.2494454383850098, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.791967232, + "gpu_mem": 4.720549888, + "loss": 0.9935, + "grad_norm": 3.6604511737823486, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.79216384, + "gpu_mem": 4.720531456, + "loss": 0.9993, + "grad_norm": 5.088719367980957, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.792360448, + "gpu_mem": 4.720542208, + "loss": 0.8598, + "grad_norm": 4.976592063903809, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.792753664, + "gpu_mem": 4.720551424, + "loss": 1.3191, + "grad_norm": 10.094297409057617, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.792950272, + "gpu_mem": 4.720566784, + "loss": 0.9233, + "grad_norm": 3.128728151321411, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.793343488, + "gpu_mem": 4.72051456, + "loss": 0.9206, + "grad_norm": 3.806426525115967, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.793540096, + "gpu_mem": 4.720534528, + "loss": 0.8886, + "grad_norm": 4.19140100479126, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.793736704, + "gpu_mem": 4.720516096, + "loss": 1.0492, + "grad_norm": 4.804494857788086, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.79412992, + "gpu_mem": 4.720532992, + "loss": 0.9445, + "grad_norm": 3.544142961502075, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.794326528, + "gpu_mem": 4.720539136, + "loss": 1.077, + "grad_norm": 5.056070804595947, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.794719744, + "gpu_mem": 4.720536064, + "loss": 0.8754, + "grad_norm": 3.3884620666503906, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.79511296, + "gpu_mem": 4.720562176, + "loss": 0.7984, + "grad_norm": 3.2185678482055664, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.795309568, + "gpu_mem": 4.720536064, + "loss": 0.8326, + "grad_norm": 5.061500549316406, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.795702784, + "gpu_mem": 4.720576, + "loss": 0.8483, + "grad_norm": 4.525794506072998, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.795899392, + "gpu_mem": 4.720525312, + "loss": 1.0289, + "grad_norm": 6.090339660644531, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.796096, + "gpu_mem": 4.720534528, + "loss": 0.8712, + "grad_norm": 5.1013898849487305, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.796292608, + "gpu_mem": 4.720554496, + "loss": 0.7884, + "grad_norm": 4.061498165130615, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.796685824, + "gpu_mem": 4.72054528, + "loss": 0.6656, + "grad_norm": 3.5959649085998535, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.796882432, + "gpu_mem": 4.720557568, + "loss": 0.9426, + "grad_norm": 5.307168960571289, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.79707904, + "gpu_mem": 4.72052224, + "loss": 0.7436, + "grad_norm": 3.5209925174713135, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.797275648, + "gpu_mem": 4.72055296, + "loss": 0.848, + "grad_norm": 3.94337797164917, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.797668864, + "gpu_mem": 4.720548352, + "loss": 1.064, + "grad_norm": 6.2684783935546875, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.79806208, + "gpu_mem": 4.720572928, + "loss": 0.8829, + "grad_norm": 3.8749735355377197, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.798258688, + "gpu_mem": 4.720509952, + "loss": 0.7822, + "grad_norm": 3.0371100902557373, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.798651904, + "gpu_mem": 4.720563712, + "loss": 0.8857, + "grad_norm": 4.887096405029297, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.798848512, + "gpu_mem": 4.720556032, + "loss": 0.8516, + "grad_norm": 4.937999248504639, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.79904512, + "gpu_mem": 4.720571392, + "loss": 0.7195, + "grad_norm": 6.111913204193115, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.799438336, + "gpu_mem": 4.720572928, + "loss": 0.6445, + "grad_norm": 5.653451919555664, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.799634944, + "gpu_mem": 4.720540672, + "loss": 0.7305, + "grad_norm": 5.545091152191162, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.801797632, + "gpu_mem": 4.720532992, + "loss": 0.7614, + "grad_norm": 5.7517828941345215, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.80199424, + "gpu_mem": 4.72056832, + "loss": 0.6567, + "grad_norm": 3.986340045928955, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.802190848, + "gpu_mem": 4.720516096, + "loss": 0.8237, + "grad_norm": 4.85936164855957, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.802584064, + "gpu_mem": 4.720551424, + "loss": 0.7143, + "grad_norm": 3.7536511421203613, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.802780672, + "gpu_mem": 4.720540672, + "loss": 0.7117, + "grad_norm": 3.8949320316314697, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.80297728, + "gpu_mem": 4.720572928, + "loss": 0.4101, + "grad_norm": 3.2398221492767334, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.803173888, + "gpu_mem": 4.720566784, + "loss": 0.9079, + "grad_norm": 8.636190414428711, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.803567104, + "gpu_mem": 4.72054528, + "loss": 0.8902, + "grad_norm": 7.243869304656982, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.803763712, + "gpu_mem": 4.720542208, + "loss": 0.9625, + "grad_norm": 6.601426124572754, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.80396032, + "gpu_mem": 4.720579072, + "loss": 0.6957, + "grad_norm": 6.074435710906982, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.804353536, + "gpu_mem": 4.72055296, + "loss": 0.4546, + "grad_norm": 4.1066179275512695, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.804550144, + "gpu_mem": 4.72052992, + "loss": 0.6003, + "grad_norm": 4.241726398468018, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.80494336, + "gpu_mem": 4.720556032, + "loss": 0.8207, + "grad_norm": 4.6515021324157715, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.805139968, + "gpu_mem": 4.72056832, + "loss": 0.8325, + "grad_norm": 3.9895925521850586, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.805336576, + "gpu_mem": 4.72052992, + "loss": 0.7099, + "grad_norm": 3.1644885540008545, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.805533184, + "gpu_mem": 4.720539136, + "loss": 0.6932, + "grad_norm": 3.6002683639526367, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.805729792, + "gpu_mem": 4.720520704, + "loss": 0.7363, + "grad_norm": 3.980494499206543, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.8059264, + "gpu_mem": 4.720534528, + "loss": 0.5143, + "grad_norm": 3.1749427318573, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.806123008, + "gpu_mem": 4.720542208, + "loss": 0.5277, + "grad_norm": 2.841369867324829, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.806319616, + "gpu_mem": 4.720523776, + "loss": 0.8914, + "grad_norm": 5.5690412521362305, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.806516224, + "gpu_mem": 4.720554496, + "loss": 0.7591, + "grad_norm": 4.848388671875, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.806712832, + "gpu_mem": 4.720525312, + "loss": 0.8352, + "grad_norm": 4.41049861907959, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.807106048, + "gpu_mem": 4.720549888, + "loss": 0.6294, + "grad_norm": 4.131494045257568, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.807302656, + "gpu_mem": 4.72052992, + "loss": 0.4275, + "grad_norm": 3.490968704223633, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.807499264, + "gpu_mem": 4.720562176, + "loss": 0.7843, + "grad_norm": 6.0852861404418945, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.807695872, + "gpu_mem": 4.72055296, + "loss": 0.7269, + "grad_norm": 4.61384391784668, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.80789248, + "gpu_mem": 4.720548352, + "loss": 0.6626, + "grad_norm": 4.397415637969971, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.810448384, + "gpu_mem": 4.720505344, + "loss": 0.7408, + "grad_norm": 4.256959438323975, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.810644992, + "gpu_mem": 4.720585216, + "loss": 0.8002, + "grad_norm": 4.698314189910889, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.8108416, + "gpu_mem": 4.720536064, + "loss": 0.6868, + "grad_norm": 4.742916584014893, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.811038208, + "gpu_mem": 4.720536064, + "loss": 0.5561, + "grad_norm": 3.329320192337036, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.811234816, + "gpu_mem": 4.720502272, + "loss": 0.7206, + "grad_norm": 4.7666473388671875, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.811628032, + "gpu_mem": 4.720542208, + "loss": 0.3835, + "grad_norm": 3.607692241668701, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.81182464, + "gpu_mem": 4.7205376, + "loss": 0.4497, + "grad_norm": 3.670856475830078, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.812217856, + "gpu_mem": 4.720525312, + "loss": 0.7858, + "grad_norm": 4.712100028991699, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.812414464, + "gpu_mem": 4.720549888, + "loss": 0.7217, + "grad_norm": 5.544972896575928, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.812611072, + "gpu_mem": 4.720565248, + "loss": 0.8271, + "grad_norm": 4.174983024597168, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.81280768, + "gpu_mem": 4.7205376, + "loss": 0.5453, + "grad_norm": 3.859314441680908, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.813004288, + "gpu_mem": 4.720534528, + "loss": 0.9142, + "grad_norm": 3.9925742149353027, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.813200896, + "gpu_mem": 4.720549888, + "loss": 0.8043, + "grad_norm": 4.044938087463379, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.813397504, + "gpu_mem": 4.720526848, + "loss": 0.6606, + "grad_norm": 3.944694995880127, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.813397504, + "gpu_mem": 4.720528384, + "loss": 0.7005, + "grad_norm": 3.898433208465576, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.815560192, + "gpu_mem": 4.720569856, + "loss": 0.5139, + "grad_norm": 3.6402008533477783, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.818116096, + "gpu_mem": 4.720540672, + "loss": 0.471, + "grad_norm": 3.600193977355957, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.818312704, + "gpu_mem": 4.720540672, + "loss": 0.8145, + "grad_norm": 4.9200029373168945, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.818509312, + "gpu_mem": 4.7205376, + "loss": 0.6847, + "grad_norm": 4.895996570587158, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.81870592, + "gpu_mem": 4.7205376, + "loss": 0.4908, + "grad_norm": 6.09129524230957, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.818902528, + "gpu_mem": 4.720528384, + "loss": 0.5981, + "grad_norm": 5.824703216552734, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.818902528, + "gpu_mem": 4.720563712, + "loss": 0.6412, + "grad_norm": 5.980694770812988, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.819099136, + "gpu_mem": 4.720520704, + "loss": 0.6966, + "grad_norm": 6.049636363983154, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.819295744, + "gpu_mem": 4.720548352, + "loss": 0.435, + "grad_norm": 3.5862605571746826, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.819492352, + "gpu_mem": 4.720557568, + "loss": 0.497, + "grad_norm": 3.8452136516571045, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.81968896, + "gpu_mem": 4.72052992, + "loss": 0.4866, + "grad_norm": 3.8942999839782715, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.819885568, + "gpu_mem": 4.720539136, + "loss": 0.5335, + "grad_norm": 3.1518399715423584, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.820082176, + "gpu_mem": 4.720540672, + "loss": 0.6087, + "grad_norm": 4.642149448394775, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.820475392, + "gpu_mem": 4.720540672, + "loss": 0.4377, + "grad_norm": 2.8820831775665283, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.820672, + "gpu_mem": 4.720525312, + "loss": 0.5365, + "grad_norm": 4.086950778961182, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.820868608, + "gpu_mem": 4.720546816, + "loss": 0.4518, + "grad_norm": 4.219118595123291, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.821065216, + "gpu_mem": 4.720580608, + "loss": 0.5606, + "grad_norm": 4.470487594604492, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.821261824, + "gpu_mem": 4.720534528, + "loss": 0.7911, + "grad_norm": 5.462947845458984, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.821458432, + "gpu_mem": 4.720540672, + "loss": 0.6531, + "grad_norm": 5.016514778137207, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.821458432, + "gpu_mem": 4.720556032, + "loss": 0.7695, + "grad_norm": 4.872075080871582, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.82165504, + "gpu_mem": 4.720574464, + "loss": 0.5722, + "grad_norm": 4.042016506195068, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.821851648, + "gpu_mem": 4.720543744, + "loss": 0.5453, + "grad_norm": 3.188103437423706, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.822048256, + "gpu_mem": 4.72052992, + "loss": 0.4689, + "grad_norm": 2.7726995944976807, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.822244864, + "gpu_mem": 4.72052224, + "loss": 0.5428, + "grad_norm": 3.429758071899414, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.82460416, + "gpu_mem": 4.720586752, + "loss": 0.5932, + "grad_norm": 3.599915027618408, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.824800768, + "gpu_mem": 4.720525312, + "loss": 0.5338, + "grad_norm": 3.389854907989502, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.824997376, + "gpu_mem": 4.720577536, + "loss": 0.5821, + "grad_norm": 3.363149404525757, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.825193984, + "gpu_mem": 4.720559104, + "loss": 0.4425, + "grad_norm": 4.195175647735596, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.825390592, + "gpu_mem": 4.720557568, + "loss": 0.5002, + "grad_norm": 3.1989200115203857, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.8255872, + "gpu_mem": 4.720562176, + "loss": 0.6165, + "grad_norm": 3.562655448913574, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.825783808, + "gpu_mem": 4.7205376, + "loss": 0.4275, + "grad_norm": 3.3661890029907227, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.825980416, + "gpu_mem": 4.720566784, + "loss": 0.5534, + "grad_norm": 5.825259208679199, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.826177024, + "gpu_mem": 4.720543744, + "loss": 0.8008, + "grad_norm": 5.267117977142334, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.826177024, + "gpu_mem": 4.720605184, + "loss": 0.5537, + "grad_norm": 4.115809440612793, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.826373632, + "gpu_mem": 4.72052992, + "loss": 0.6066, + "grad_norm": 4.933228492736816, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.82853632, + "gpu_mem": 4.720540672, + "loss": 0.588, + "grad_norm": 4.033113479614258, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.828732928, + "gpu_mem": 4.720539136, + "loss": 0.7246, + "grad_norm": 3.9991984367370605, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.831092224, + "gpu_mem": 4.720536064, + "loss": 0.6652, + "grad_norm": 3.058049201965332, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.831288832, + "gpu_mem": 4.720566784, + "loss": 0.4885, + "grad_norm": 3.4072256088256836, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.831288832, + "gpu_mem": 4.72054528, + "loss": 0.6389, + "grad_norm": 3.776989459991455, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.833648128, + "gpu_mem": 4.720540672, + "loss": 0.5267, + "grad_norm": 3.7145767211914062, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.833844736, + "gpu_mem": 4.720551424, + "loss": 0.6671, + "grad_norm": 3.9254729747772217, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.834041344, + "gpu_mem": 4.720556032, + "loss": 0.4051, + "grad_norm": 2.6177785396575928, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.834237952, + "gpu_mem": 4.720517632, + "loss": 0.5514, + "grad_norm": 4.025628566741943, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.83443456, + "gpu_mem": 4.720585216, + "loss": 0.9006, + "grad_norm": 5.515537261962891, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.83443456, + "gpu_mem": 4.720548352, + "loss": 0.5809, + "grad_norm": 4.265677452087402, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.834631168, + "gpu_mem": 4.7205376, + "loss": 0.5817, + "grad_norm": 5.189319610595703, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.834827776, + "gpu_mem": 4.720554496, + "loss": 0.5774, + "grad_norm": 3.914578437805176, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.835024384, + "gpu_mem": 4.720528384, + "loss": 0.521, + "grad_norm": 4.013396263122559, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.835220992, + "gpu_mem": 4.720576, + "loss": 0.5545, + "grad_norm": 4.0251312255859375, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.8354176, + "gpu_mem": 4.720543744, + "loss": 0.6066, + "grad_norm": 4.933129787445068, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.8354176, + "gpu_mem": 4.720532992, + "loss": 0.4867, + "grad_norm": 2.58709454536438, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.835614208, + "gpu_mem": 4.720548352, + "loss": 0.5215, + "grad_norm": 3.781154155731201, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.835810816, + "gpu_mem": 4.72054528, + "loss": 0.4686, + "grad_norm": 3.088909864425659, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.836007424, + "gpu_mem": 4.72054528, + "loss": 0.4709, + "grad_norm": 4.010195255279541, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.836007424, + "gpu_mem": 4.720532992, + "loss": 0.5073, + "grad_norm": 3.446216344833374, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.836204032, + "gpu_mem": 4.720516096, + "loss": 0.9743, + "grad_norm": 7.685817241668701, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.836204032, + "gpu_mem": 4.720579072, + "loss": 0.4985, + "grad_norm": 3.612640380859375, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.83640064, + "gpu_mem": 4.720532992, + "loss": 0.4882, + "grad_norm": 4.941830158233643, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.836597248, + "gpu_mem": 4.720542208, + "loss": 0.4644, + "grad_norm": 4.333150386810303, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.836793856, + "gpu_mem": 4.720577536, + "loss": 0.3583, + "grad_norm": 3.0434811115264893, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.836990464, + "gpu_mem": 4.720542208, + "loss": 0.584, + "grad_norm": 5.2302165031433105, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.837187072, + "gpu_mem": 4.720546816, + "loss": 0.5768, + "grad_norm": 4.052805423736572, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.83738368, + "gpu_mem": 4.720594432, + "loss": 0.4545, + "grad_norm": 5.0773444175720215, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.83738368, + "gpu_mem": 4.720603648, + "loss": 0.3998, + "grad_norm": 4.037325859069824, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.837580288, + "gpu_mem": 4.720557568, + "loss": 0.502, + "grad_norm": 6.233963489532471, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.837776896, + "gpu_mem": 4.720551424, + "loss": 0.5955, + "grad_norm": 4.025015354156494, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.837776896, + "gpu_mem": 4.720612864, + "loss": 0.4641, + "grad_norm": 3.381136894226074, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.837973504, + "gpu_mem": 4.720539136, + "loss": 0.4912, + "grad_norm": 3.9406044483184814, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.838170112, + "gpu_mem": 4.7205376, + "loss": 0.419, + "grad_norm": 3.5241832733154297, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.838170112, + "gpu_mem": 4.720540672, + "loss": 0.4786, + "grad_norm": 3.7314419746398926, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.83836672, + "gpu_mem": 4.720526848, + "loss": 0.5239, + "grad_norm": 3.8411033153533936, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.83836672, + "gpu_mem": 4.720542208, + "loss": 0.4144, + "grad_norm": 3.1702229976654053, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.838563328, + "gpu_mem": 4.720580608, + "loss": 0.5616, + "grad_norm": 3.776132106781006, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.838759936, + "gpu_mem": 4.72056064, + "loss": 0.5519, + "grad_norm": 3.8856117725372314, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.838759936, + "gpu_mem": 4.720586752, + "loss": 0.7481, + "grad_norm": 4.943800926208496, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.838956544, + "gpu_mem": 4.7205376, + "loss": 0.3678, + "grad_norm": 3.693385362625122, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.839153152, + "gpu_mem": 4.720531456, + "loss": 0.5228, + "grad_norm": 3.9281654357910156, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.83934976, + "gpu_mem": 4.720554496, + "loss": 0.4232, + "grad_norm": 2.7370541095733643, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.83934976, + "gpu_mem": 4.720532992, + "loss": 0.5094, + "grad_norm": 3.1187233924865723, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.839546368, + "gpu_mem": 4.720546816, + "loss": 0.5672, + "grad_norm": 3.8652682304382324, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.839742976, + "gpu_mem": 4.720551424, + "loss": 0.4553, + "grad_norm": 3.8987295627593994, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.839742976, + "gpu_mem": 4.720569856, + "loss": 0.5468, + "grad_norm": 4.452647686004639, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.839742976, + "gpu_mem": 4.720540672, + "loss": 0.4143, + "grad_norm": 3.378143310546875, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.839939584, + "gpu_mem": 4.72056832, + "loss": 0.7018, + "grad_norm": 4.218902587890625, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.840136192, + "gpu_mem": 4.720549888, + "loss": 0.4628, + "grad_norm": 3.792783260345459, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.8403328, + "gpu_mem": 4.7205376, + "loss": 0.5343, + "grad_norm": 3.8773250579833984, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.8403328, + "gpu_mem": 4.720546816, + "loss": 0.6021, + "grad_norm": 4.1986589431762695, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.840529408, + "gpu_mem": 4.720543744, + "loss": 0.4238, + "grad_norm": 2.9637720584869385, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.840529408, + "gpu_mem": 4.720559104, + "loss": 0.446, + "grad_norm": 4.196908473968506, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.840726016, + "gpu_mem": 4.720566784, + "loss": 0.5175, + "grad_norm": 4.567485332489014, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.840922624, + "gpu_mem": 4.720556032, + "loss": 0.5079, + "grad_norm": 4.384257793426514, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.840922624, + "gpu_mem": 4.720540672, + "loss": 0.6646, + "grad_norm": 4.190060615539551, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.841119232, + "gpu_mem": 4.720543744, + "loss": 0.4932, + "grad_norm": 4.083422660827637, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.841119232, + "gpu_mem": 4.7205376, + "loss": 0.528, + "grad_norm": 4.365039348602295, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.841119232, + "gpu_mem": 4.720532992, + "loss": 0.669, + "grad_norm": 4.034911632537842, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.84131584, + "gpu_mem": 4.720554496, + "loss": 0.7482, + "grad_norm": 4.173949718475342, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.841512448, + "gpu_mem": 4.720546816, + "loss": 0.7059, + "grad_norm": 3.261152982711792, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.841512448, + "gpu_mem": 4.720519168, + "loss": 0.6538, + "grad_norm": 3.2103867530822754, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.841512448, + "gpu_mem": 4.720517632, + "loss": 0.5925, + "grad_norm": 2.4563100337982178, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.841709056, + "gpu_mem": 4.720543744, + "loss": 0.6397, + "grad_norm": 2.883416175842285, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.841709056, + "gpu_mem": 4.720526848, + "loss": 0.4067, + "grad_norm": 2.6274826526641846, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.841905664, + "gpu_mem": 4.720557568, + "loss": 0.4968, + "grad_norm": 2.923224687576294, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.842102272, + "gpu_mem": 4.720540672, + "loss": 0.5687, + "grad_norm": 3.171687602996826, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.84229888, + "gpu_mem": 4.720571392, + "loss": 0.5561, + "grad_norm": 3.6087560653686523, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.84229888, + "gpu_mem": 4.720539136, + "loss": 0.4705, + "grad_norm": 3.970033884048462, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.842495488, + "gpu_mem": 4.720565248, + "loss": 0.4474, + "grad_norm": 4.067206382751465, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.842495488, + "gpu_mem": 4.720540672, + "loss": 0.4375, + "grad_norm": 4.885968208312988, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.842692096, + "gpu_mem": 4.720536064, + "loss": 0.7706, + "grad_norm": 5.980837821960449, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.842692096, + "gpu_mem": 4.720539136, + "loss": 0.3448, + "grad_norm": 4.665349006652832, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.842692096, + "gpu_mem": 4.720557568, + "loss": 0.5202, + "grad_norm": 5.223811149597168, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.842692096, + "gpu_mem": 4.7205376, + "loss": 0.5682, + "grad_norm": 5.580026626586914, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.842888704, + "gpu_mem": 4.720542208, + "loss": 0.4495, + "grad_norm": 4.3583221435546875, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.843085312, + "gpu_mem": 4.7205376, + "loss": 0.571, + "grad_norm": 4.94177770614624, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.843085312, + "gpu_mem": 4.72054528, + "loss": 0.2957, + "grad_norm": 2.4039363861083984, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.84328192, + "gpu_mem": 4.720569856, + "loss": 0.5201, + "grad_norm": 4.549537181854248, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.84328192, + "gpu_mem": 4.720562176, + "loss": 0.4872, + "grad_norm": 3.916066884994507, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.843478528, + "gpu_mem": 4.720563712, + "loss": 0.5526, + "grad_norm": 3.6068239212036133, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.843675136, + "gpu_mem": 4.720539136, + "loss": 0.5131, + "grad_norm": 3.439851760864258, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.843675136, + "gpu_mem": 4.720540672, + "loss": 0.6036, + "grad_norm": 3.8973946571350098, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.843675136, + "gpu_mem": 4.72056064, + "loss": 0.5055, + "grad_norm": 3.2447736263275146, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.843871744, + "gpu_mem": 4.720532992, + "loss": 0.7675, + "grad_norm": 4.024089336395264, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.844068352, + "gpu_mem": 4.72054528, + "loss": 0.3114, + "grad_norm": 3.120734691619873, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.84426496, + "gpu_mem": 4.720554496, + "loss": 0.4512, + "grad_norm": 4.173813343048096, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.84426496, + "gpu_mem": 4.720531456, + "loss": 0.4703, + "grad_norm": 3.780308723449707, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.84426496, + "gpu_mem": 4.720556032, + "loss": 0.7453, + "grad_norm": 5.381161689758301, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.84426496, + "gpu_mem": 4.720556032, + "loss": 0.5457, + "grad_norm": 4.106481552124023, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.844461568, + "gpu_mem": 4.720539136, + "loss": 0.4135, + "grad_norm": 3.452975034713745, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.844461568, + "gpu_mem": 4.720536064, + "loss": 0.3362, + "grad_norm": 3.2590513229370117, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.844461568, + "gpu_mem": 4.720528384, + "loss": 0.5362, + "grad_norm": 5.263980388641357, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.844658176, + "gpu_mem": 4.720566784, + "loss": 0.4807, + "grad_norm": 3.585007667541504, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.844658176, + "gpu_mem": 4.720543744, + "loss": 0.976, + "grad_norm": 5.7225518226623535, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.844854784, + "gpu_mem": 4.720542208, + "loss": 0.6364, + "grad_norm": 4.316378116607666, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.844854784, + "gpu_mem": 4.720559104, + "loss": 0.519, + "grad_norm": 4.536004066467285, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.845051392, + "gpu_mem": 4.720543744, + "loss": 0.3603, + "grad_norm": 2.7211925983428955, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.845051392, + "gpu_mem": 4.720556032, + "loss": 0.5568, + "grad_norm": 2.9896998405456543, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.845051392, + "gpu_mem": 4.72056832, + "loss": 0.3551, + "grad_norm": 2.384856939315796, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.845051392, + "gpu_mem": 4.720543744, + "loss": 0.426, + "grad_norm": 2.9072306156158447, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.845248, + "gpu_mem": 4.720588288, + "loss": 0.6195, + "grad_norm": 3.7444984912872314, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.845248, + "gpu_mem": 4.720562176, + "loss": 0.5569, + "grad_norm": 3.814422130584717, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.845444608, + "gpu_mem": 4.720559104, + "loss": 0.4152, + "grad_norm": 2.7806289196014404, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.845444608, + "gpu_mem": 4.720540672, + "loss": 0.2758, + "grad_norm": 2.6996688842773438, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.845641216, + "gpu_mem": 4.720546816, + "loss": 0.3315, + "grad_norm": 3.1441996097564697, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.845641216, + "gpu_mem": 4.720516096, + "loss": 0.5032, + "grad_norm": 4.1909613609313965, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.845837824, + "gpu_mem": 4.720580608, + "loss": 0.6732, + "grad_norm": 5.483664512634277, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.845837824, + "gpu_mem": 4.720534528, + "loss": 0.5658, + "grad_norm": 4.995872974395752, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.845837824, + "gpu_mem": 4.720528384, + "loss": 0.4636, + "grad_norm": 4.079861164093018, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.846034432, + "gpu_mem": 4.72058368, + "loss": 0.239, + "grad_norm": 2.886357307434082, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.84623104, + "gpu_mem": 4.720549888, + "loss": 0.4422, + "grad_norm": 5.631111145019531, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.84623104, + "gpu_mem": 4.7205376, + "loss": 0.3458, + "grad_norm": 3.8821935653686523, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.84623104, + "gpu_mem": 4.720542208, + "loss": 0.3346, + "grad_norm": 3.4518489837646484, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.84623104, + "gpu_mem": 4.72052224, + "loss": 0.5712, + "grad_norm": 5.143901348114014, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.846427648, + "gpu_mem": 4.720546816, + "loss": 0.4449, + "grad_norm": 4.08884859085083, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.846427648, + "gpu_mem": 4.720525312, + "loss": 0.3481, + "grad_norm": 4.157481670379639, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.846427648, + "gpu_mem": 4.720542208, + "loss": 0.3693, + "grad_norm": 4.340169906616211, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.846624256, + "gpu_mem": 4.72050688, + "loss": 0.445, + "grad_norm": 5.416385173797607, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.846624256, + "gpu_mem": 4.720539136, + "loss": 0.3455, + "grad_norm": 3.5990593433380127, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.846820864, + "gpu_mem": 4.720528384, + "loss": 0.3387, + "grad_norm": 3.7677319049835205, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.846820864, + "gpu_mem": 4.720565248, + "loss": 0.5759, + "grad_norm": 4.844612121582031, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.846820864, + "gpu_mem": 4.720531456, + "loss": 0.5074, + "grad_norm": 4.806457042694092, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.846820864, + "gpu_mem": 4.720554496, + "loss": 0.3813, + "grad_norm": 4.249344348907471, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.846820864, + "gpu_mem": 4.720543744, + "loss": 0.4744, + "grad_norm": 4.4250054359436035, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.847017472, + "gpu_mem": 4.720549888, + "loss": 0.4057, + "grad_norm": 3.66872239112854, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.847017472, + "gpu_mem": 4.720543744, + "loss": 0.5568, + "grad_norm": 4.155337333679199, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.84721408, + "gpu_mem": 4.720562176, + "loss": 0.6101, + "grad_norm": 4.615206718444824, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.84721408, + "gpu_mem": 4.72052224, + "loss": 0.3701, + "grad_norm": 2.8196117877960205, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.84721408, + "gpu_mem": 4.720554496, + "loss": 0.7196, + "grad_norm": 4.3603010177612305, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.847410688, + "gpu_mem": 4.720574464, + "loss": 0.6025, + "grad_norm": 3.7369251251220703, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.847410688, + "gpu_mem": 4.72056832, + "loss": 0.4856, + "grad_norm": 3.3088488578796387, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.847410688, + "gpu_mem": 4.720531456, + "loss": 0.5732, + "grad_norm": 3.2756917476654053, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.847607296, + "gpu_mem": 4.720548352, + "loss": 0.3243, + "grad_norm": 2.769108772277832, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.847607296, + "gpu_mem": 4.720525312, + "loss": 0.5527, + "grad_norm": 2.630009889602661, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.847607296, + "gpu_mem": 4.720557568, + "loss": 0.4677, + "grad_norm": 3.176006555557251, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.847607296, + "gpu_mem": 4.72055296, + "loss": 0.3802, + "grad_norm": 2.937474250793457, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.847607296, + "gpu_mem": 4.720562176, + "loss": 0.3597, + "grad_norm": 3.2246341705322266, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.847803904, + "gpu_mem": 4.720536064, + "loss": 0.407, + "grad_norm": 3.3606414794921875, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.847803904, + "gpu_mem": 4.720556032, + "loss": 0.4669, + "grad_norm": 3.0717763900756836, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.847803904, + "gpu_mem": 4.72052992, + "loss": 0.4462, + "grad_norm": 3.0623738765716553, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.848000512, + "gpu_mem": 4.720554496, + "loss": 0.4175, + "grad_norm": 3.199270248413086, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.848000512, + "gpu_mem": 4.720539136, + "loss": 0.2448, + "grad_norm": 2.971148729324341, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.848000512, + "gpu_mem": 4.720572928, + "loss": 0.4019, + "grad_norm": 4.172274589538574, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.84819712, + "gpu_mem": 4.72055296, + "loss": 0.4257, + "grad_norm": 4.589349746704102, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.84819712, + "gpu_mem": 4.7205376, + "loss": 0.4293, + "grad_norm": 4.5286712646484375, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.848393728, + "gpu_mem": 4.720572928, + "loss": 0.4668, + "grad_norm": 4.871840953826904, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.848393728, + "gpu_mem": 4.720579072, + "loss": 0.3236, + "grad_norm": 5.063573837280273, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.848393728, + "gpu_mem": 4.720542208, + "loss": 0.4673, + "grad_norm": 5.670955657958984, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.848393728, + "gpu_mem": 4.720520704, + "loss": 0.4111, + "grad_norm": 4.968171119689941, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.848393728, + "gpu_mem": 4.720572928, + "loss": 0.595, + "grad_norm": 4.709218502044678, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.848393728, + "gpu_mem": 4.720559104, + "loss": 0.2999, + "grad_norm": 3.508289098739624, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.848393728, + "gpu_mem": 4.72055296, + "loss": 0.6149, + "grad_norm": 6.037300109863281, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.848590336, + "gpu_mem": 4.720559104, + "loss": 0.514, + "grad_norm": 3.85451602935791, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.848590336, + "gpu_mem": 4.720536064, + "loss": 0.6238, + "grad_norm": 5.335838317871094, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.848590336, + "gpu_mem": 4.720549888, + "loss": 0.2083, + "grad_norm": 2.050891160964966, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.848590336, + "gpu_mem": 4.720549888, + "loss": 0.6843, + "grad_norm": 5.356147766113281, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.848786944, + "gpu_mem": 4.720519168, + "loss": 0.369, + "grad_norm": 3.5975537300109863, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.848786944, + "gpu_mem": 4.72055296, + "loss": 0.329, + "grad_norm": 3.9362289905548096, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.848786944, + "gpu_mem": 4.720531456, + "loss": 0.438, + "grad_norm": 3.512831211090088, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.848786944, + "gpu_mem": 4.720539136, + "loss": 0.3141, + "grad_norm": 3.05904483795166, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.848983552, + "gpu_mem": 4.720557568, + "loss": 0.4165, + "grad_norm": 3.2671499252319336, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.84918016, + "gpu_mem": 4.720525312, + "loss": 0.583, + "grad_norm": 4.520811557769775, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.84918016, + "gpu_mem": 4.72052992, + "loss": 0.3878, + "grad_norm": 2.7461984157562256, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.84918016, + "gpu_mem": 4.720525312, + "loss": 0.3627, + "grad_norm": 2.3579635620117188, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.84918016, + "gpu_mem": 4.720569856, + "loss": 0.5495, + "grad_norm": 4.761881351470947, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.849376768, + "gpu_mem": 4.72055296, + "loss": 0.3223, + "grad_norm": 3.343362808227539, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.849376768, + "gpu_mem": 4.720542208, + "loss": 0.3886, + "grad_norm": 3.078606128692627, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.849376768, + "gpu_mem": 4.720563712, + "loss": 0.3166, + "grad_norm": 3.269817590713501, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.849376768, + "gpu_mem": 4.72052992, + "loss": 0.408, + "grad_norm": 3.588259696960449, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.849573376, + "gpu_mem": 4.72054528, + "loss": 0.382, + "grad_norm": 3.2437572479248047, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.849573376, + "gpu_mem": 4.72054528, + "loss": 0.4926, + "grad_norm": 3.4115805625915527, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.849573376, + "gpu_mem": 4.720536064, + "loss": 0.3312, + "grad_norm": 3.059218406677246, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.849573376, + "gpu_mem": 4.720546816, + "loss": 0.6388, + "grad_norm": 4.750514030456543, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.849573376, + "gpu_mem": 4.720571392, + "loss": 0.4278, + "grad_norm": 3.7710886001586914, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.849573376, + "gpu_mem": 4.720523776, + "loss": 0.5279, + "grad_norm": 3.685518741607666, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.849573376, + "gpu_mem": 4.720559104, + "loss": 0.4343, + "grad_norm": 4.250965118408203, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.849769984, + "gpu_mem": 4.720520704, + "loss": 0.411, + "grad_norm": 3.7654166221618652, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.849966592, + "gpu_mem": 4.720539136, + "loss": 0.4448, + "grad_norm": 3.7771546840667725, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.849966592, + "gpu_mem": 4.720531456, + "loss": 0.3723, + "grad_norm": 3.0325658321380615, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.849966592, + "gpu_mem": 4.72056832, + "loss": 0.4171, + "grad_norm": 2.913794994354248, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.849966592, + "gpu_mem": 4.720528384, + "loss": 0.3441, + "grad_norm": 3.3999364376068115, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.849966592, + "gpu_mem": 4.720542208, + "loss": 0.3345, + "grad_norm": 2.7293505668640137, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.849966592, + "gpu_mem": 4.720546816, + "loss": 0.2857, + "grad_norm": 2.9734745025634766, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.849966592, + "gpu_mem": 4.720508416, + "loss": 0.4494, + "grad_norm": 3.9761106967926025, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.849966592, + "gpu_mem": 4.720531456, + "loss": 0.5635, + "grad_norm": 3.7540183067321777, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.8501632, + "gpu_mem": 4.72052992, + "loss": 0.2158, + "grad_norm": 2.3447182178497314, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.8501632, + "gpu_mem": 4.720548352, + "loss": 0.476, + "grad_norm": 4.442580223083496, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.850359808, + "gpu_mem": 4.72054528, + "loss": 0.4089, + "grad_norm": 3.773097038269043, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.850359808, + "gpu_mem": 4.720543744, + "loss": 0.48, + "grad_norm": 4.508922100067139, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.850359808, + "gpu_mem": 4.720562176, + "loss": 0.3208, + "grad_norm": 3.905177593231201, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.850359808, + "gpu_mem": 4.720523776, + "loss": 0.5117, + "grad_norm": 4.572379112243652, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.850359808, + "gpu_mem": 4.72056832, + "loss": 0.3113, + "grad_norm": 3.333207130432129, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.850359808, + "gpu_mem": 4.720532992, + "loss": 0.3551, + "grad_norm": 3.3027172088623047, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.850556416, + "gpu_mem": 4.72056064, + "loss": 0.4954, + "grad_norm": 5.295358657836914, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.850556416, + "gpu_mem": 4.720540672, + "loss": 0.4484, + "grad_norm": 4.713922023773193, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720586752, + "loss": 0.4355, + "grad_norm": 4.018678665161133, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720551424, + "loss": 0.3533, + "grad_norm": 3.6029505729675293, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720542208, + "loss": 0.277, + "grad_norm": 3.1121442317962646, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720536064, + "loss": 0.2842, + "grad_norm": 3.1773321628570557, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720520704, + "loss": 0.5414, + "grad_norm": 3.897446870803833, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720539136, + "loss": 0.6575, + "grad_norm": 5.000871658325195, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720540672, + "loss": 0.3082, + "grad_norm": 2.776646852493286, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.850753024, + "gpu_mem": 4.72054528, + "loss": 0.3874, + "grad_norm": 4.234773635864258, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.850753024, + "gpu_mem": 4.720548352, + "loss": 0.3219, + "grad_norm": 3.2595114707946777, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720542208, + "loss": 0.4649, + "grad_norm": 4.4524030685424805, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.850949632, + "gpu_mem": 4.72056832, + "loss": 0.6603, + "grad_norm": 4.800167560577393, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720536064, + "loss": 0.3887, + "grad_norm": 3.265150308609009, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720563712, + "loss": 0.3711, + "grad_norm": 4.501293182373047, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720571392, + "loss": 0.3759, + "grad_norm": 3.397003173828125, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.850949632, + "gpu_mem": 4.72055296, + "loss": 0.4915, + "grad_norm": 4.395791530609131, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720539136, + "loss": 0.3682, + "grad_norm": 3.3490183353424072, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720549888, + "loss": 0.4594, + "grad_norm": 4.575304985046387, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720542208, + "loss": 0.3056, + "grad_norm": 3.527523994445801, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720559104, + "loss": 0.4755, + "grad_norm": 4.2182207107543945, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.850949632, + "gpu_mem": 4.720531456, + "loss": 0.3991, + "grad_norm": 4.103976726531982, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.85114624, + "gpu_mem": 4.720562176, + "loss": 0.3709, + "grad_norm": 2.8709001541137695, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720543744, + "loss": 0.5557, + "grad_norm": 5.510714054107666, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720531456, + "loss": 0.4157, + "grad_norm": 3.6529366970062256, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720543744, + "loss": 0.3844, + "grad_norm": 3.6382675170898438, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720549888, + "loss": 0.255, + "grad_norm": 3.672861337661743, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.851342848, + "gpu_mem": 4.7205376, + "loss": 0.2724, + "grad_norm": 2.8575077056884766, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720526848, + "loss": 0.2354, + "grad_norm": 2.8211300373077393, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720528384, + "loss": 0.3436, + "grad_norm": 3.970731258392334, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720542208, + "loss": 0.4048, + "grad_norm": 3.4550774097442627, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.851342848, + "gpu_mem": 4.72054528, + "loss": 0.4614, + "grad_norm": 4.5353193283081055, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720556032, + "loss": 0.4686, + "grad_norm": 5.2157816886901855, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.851342848, + "gpu_mem": 4.72052992, + "loss": 0.415, + "grad_norm": 3.469881772994995, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.851342848, + "gpu_mem": 4.72054528, + "loss": 0.6479, + "grad_norm": 4.875113010406494, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720554496, + "loss": 0.341, + "grad_norm": 3.6151936054229736, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.851342848, + "gpu_mem": 4.720528384, + "loss": 0.533, + "grad_norm": 5.140727996826172, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.851539456, + "gpu_mem": 4.720534528, + "loss": 0.3319, + "grad_norm": 3.277714967727661, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.851539456, + "gpu_mem": 4.720523776, + "loss": 0.4694, + "grad_norm": 3.496671676635742, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.851539456, + "gpu_mem": 4.72052992, + "loss": 0.3937, + "grad_norm": 4.2403950691223145, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.851539456, + "gpu_mem": 4.720566784, + "loss": 0.264, + "grad_norm": 3.1852779388427734, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.851539456, + "gpu_mem": 4.72051456, + "loss": 0.5384, + "grad_norm": 5.356995582580566, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.851539456, + "gpu_mem": 4.720534528, + "loss": 0.3588, + "grad_norm": 3.862870216369629, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.851539456, + "gpu_mem": 4.720534528, + "loss": 0.3914, + "grad_norm": 3.4243338108062744, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.851539456, + "gpu_mem": 4.720532992, + "loss": 0.3258, + "grad_norm": 2.846864938735962, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.851736064, + "gpu_mem": 4.720531456, + "loss": 0.2571, + "grad_norm": 3.027273416519165, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.851736064, + "gpu_mem": 4.720523776, + "loss": 0.4565, + "grad_norm": 4.207841873168945, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.851932672, + "gpu_mem": 4.72058368, + "loss": 0.2933, + "grad_norm": 2.5025956630706787, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720528384, + "loss": 0.4107, + "grad_norm": 3.3136467933654785, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720511488, + "loss": 0.4718, + "grad_norm": 3.0678510665893555, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720542208, + "loss": 0.577, + "grad_norm": 3.9010672569274902, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720586752, + "loss": 0.3807, + "grad_norm": 3.5200212001800537, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720566784, + "loss": 0.3084, + "grad_norm": 3.824744462966919, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720566784, + "loss": 0.2274, + "grad_norm": 2.2511284351348877, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720532992, + "loss": 0.3234, + "grad_norm": 3.275568723678589, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.851932672, + "gpu_mem": 4.720557568, + "loss": 0.4127, + "grad_norm": 2.867774248123169, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.851932672, + "gpu_mem": 4.72056064, + "loss": 0.4143, + "grad_norm": 3.938815116882324, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720539136, + "loss": 0.3237, + "grad_norm": 2.7918028831481934, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720562176, + "loss": 0.2506, + "grad_norm": 2.4466474056243896, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720542208, + "loss": 0.3909, + "grad_norm": 4.424420356750488, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720566784, + "loss": 0.3421, + "grad_norm": 3.128190517425537, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720549888, + "loss": 0.2982, + "grad_norm": 2.9297492504119873, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.85212928, + "gpu_mem": 4.72054528, + "loss": 0.3702, + "grad_norm": 3.8990285396575928, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720557568, + "loss": 0.3583, + "grad_norm": 3.2291672229766846, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720528384, + "loss": 0.3541, + "grad_norm": 4.526378631591797, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720542208, + "loss": 0.4501, + "grad_norm": 3.3313324451446533, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720528384, + "loss": 0.3625, + "grad_norm": 4.104772567749023, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.85212928, + "gpu_mem": 4.72052224, + "loss": 0.3259, + "grad_norm": 3.6812901496887207, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720528384, + "loss": 0.2609, + "grad_norm": 3.425013542175293, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720542208, + "loss": 0.4273, + "grad_norm": 4.243935585021973, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720525312, + "loss": 0.5019, + "grad_norm": 4.30189847946167, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720579072, + "loss": 0.2946, + "grad_norm": 2.4200336933135986, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.85212928, + "gpu_mem": 4.72052224, + "loss": 0.2182, + "grad_norm": 2.7074618339538574, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720600576, + "loss": 0.3734, + "grad_norm": 4.562469005584717, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.85212928, + "gpu_mem": 4.720543744, + "loss": 0.277, + "grad_norm": 2.718461275100708, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.852325888, + "gpu_mem": 4.720562176, + "loss": 0.4068, + "grad_norm": 4.243529796600342, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.852325888, + "gpu_mem": 4.7205376, + "loss": 0.2632, + "grad_norm": 3.0724143981933594, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720569856, + "loss": 0.5566, + "grad_norm": 5.213530540466309, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720589824, + "loss": 0.4058, + "grad_norm": 4.296562194824219, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720519168, + "loss": 0.3248, + "grad_norm": 3.392761468887329, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720532992, + "loss": 0.2288, + "grad_norm": 3.0542545318603516, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720517632, + "loss": 0.5084, + "grad_norm": 5.2941741943359375, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720556032, + "loss": 0.2364, + "grad_norm": 2.4198131561279297, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720556032, + "loss": 0.4098, + "grad_norm": 3.8796377182006836, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720542208, + "loss": 0.3362, + "grad_norm": 3.8518259525299072, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720532992, + "loss": 0.2947, + "grad_norm": 3.185007095336914, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.852522496, + "gpu_mem": 4.7205376, + "loss": 0.4535, + "grad_norm": 3.977987766265869, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720540672, + "loss": 0.4106, + "grad_norm": 4.126715660095215, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720546816, + "loss": 0.3091, + "grad_norm": 3.013603925704956, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720565248, + "loss": 0.2818, + "grad_norm": 3.990391731262207, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720559104, + "loss": 0.5058, + "grad_norm": 4.9119648933410645, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720536064, + "loss": 0.2244, + "grad_norm": 2.438129425048828, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720523776, + "loss": 0.4372, + "grad_norm": 3.8770358562469482, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720489984, + "loss": 0.3935, + "grad_norm": 3.9676287174224854, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.852522496, + "gpu_mem": 4.7205376, + "loss": 0.3213, + "grad_norm": 3.559718370437622, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720503808, + "loss": 0.4361, + "grad_norm": 3.3674979209899902, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720551424, + "loss": 0.4226, + "grad_norm": 3.7541987895965576, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.852522496, + "gpu_mem": 4.720549888, + "loss": 0.3885, + "grad_norm": 2.987833261489868, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.852719104, + "gpu_mem": 4.720551424, + "loss": 0.2451, + "grad_norm": 2.6760337352752686, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.852719104, + "gpu_mem": 4.72056064, + "loss": 0.5327, + "grad_norm": 5.310992240905762, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.852719104, + "gpu_mem": 4.720536064, + "loss": 0.3923, + "grad_norm": 3.9822824001312256, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.852719104, + "gpu_mem": 4.720520704, + "loss": 0.3539, + "grad_norm": 3.6433048248291016, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720549888, + "loss": 0.3371, + "grad_norm": 2.9053633213043213, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720563712, + "loss": 0.3465, + "grad_norm": 3.436824321746826, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720519168, + "loss": 0.334, + "grad_norm": 3.257223129272461, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720525312, + "loss": 0.4804, + "grad_norm": 3.544076681137085, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720554496, + "loss": 0.4318, + "grad_norm": 3.954712152481079, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720549888, + "loss": 0.3014, + "grad_norm": 3.6946277618408203, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720536064, + "loss": 0.5866, + "grad_norm": 5.290384292602539, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720549888, + "loss": 0.3288, + "grad_norm": 3.3251030445098877, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720539136, + "loss": 0.375, + "grad_norm": 4.027219295501709, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.852915712, + "gpu_mem": 4.72054528, + "loss": 0.2697, + "grad_norm": 2.4976985454559326, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720549888, + "loss": 0.5072, + "grad_norm": 4.492164611816406, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.852915712, + "gpu_mem": 4.72054528, + "loss": 0.3662, + "grad_norm": 3.4206387996673584, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720519168, + "loss": 0.3408, + "grad_norm": 2.8707079887390137, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720528384, + "loss": 0.3908, + "grad_norm": 3.2700259685516357, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720546816, + "loss": 0.5202, + "grad_norm": 4.23433780670166, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720517632, + "loss": 0.3379, + "grad_norm": 2.8816049098968506, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720548352, + "loss": 0.4193, + "grad_norm": 3.8861663341522217, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720557568, + "loss": 0.2624, + "grad_norm": 2.7214395999908447, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720519168, + "loss": 0.4888, + "grad_norm": 3.5735175609588623, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720523776, + "loss": 0.3097, + "grad_norm": 3.183973789215088, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720548352, + "loss": 0.4163, + "grad_norm": 3.666163921356201, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720566784, + "loss": 0.3364, + "grad_norm": 3.323470115661621, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720548352, + "loss": 0.4435, + "grad_norm": 3.6063313484191895, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.852915712, + "gpu_mem": 4.72059904, + "loss": 0.5035, + "grad_norm": 3.6231212615966797, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720531456, + "loss": 0.8143, + "grad_norm": 4.744032382965088, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720532992, + "loss": 0.3489, + "grad_norm": 4.081301689147949, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720532992, + "loss": 0.4037, + "grad_norm": 3.5316288471221924, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.852915712, + "gpu_mem": 4.720539136, + "loss": 0.2611, + "grad_norm": 3.360579013824463, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72055296, + "loss": 0.285, + "grad_norm": 2.9638216495513916, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720557568, + "loss": 0.4469, + "grad_norm": 4.186419486999512, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720551424, + "loss": 0.4776, + "grad_norm": 3.4684760570526123, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72054528, + "loss": 0.3747, + "grad_norm": 2.962671995162964, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720559104, + "loss": 0.5951, + "grad_norm": 4.555113792419434, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720551424, + "loss": 0.5522, + "grad_norm": 4.260589122772217, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720536064, + "loss": 0.4622, + "grad_norm": 4.028753757476807, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72054528, + "loss": 0.4183, + "grad_norm": 3.320568323135376, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720554496, + "loss": 0.2571, + "grad_norm": 3.13403058052063, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720557568, + "loss": 0.3707, + "grad_norm": 3.0847299098968506, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720511488, + "loss": 0.4385, + "grad_norm": 4.007024765014648, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720565248, + "loss": 0.1897, + "grad_norm": 2.7542381286621094, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720566784, + "loss": 0.3052, + "grad_norm": 2.9125266075134277, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720511488, + "loss": 0.3784, + "grad_norm": 2.951606512069702, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72054528, + "loss": 0.4002, + "grad_norm": 3.634507894515991, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720523776, + "loss": 0.395, + "grad_norm": 3.831821918487549, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720554496, + "loss": 0.3121, + "grad_norm": 2.7136783599853516, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72052992, + "loss": 0.2885, + "grad_norm": 3.4227333068847656, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720563712, + "loss": 0.2843, + "grad_norm": 3.070899248123169, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72058368, + "loss": 0.4583, + "grad_norm": 3.760401487350464, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720548352, + "loss": 0.3484, + "grad_norm": 3.4083871841430664, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72056832, + "loss": 0.3228, + "grad_norm": 3.1646687984466553, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720548352, + "loss": 0.4006, + "grad_norm": 3.5837419033050537, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720549888, + "loss": 0.3418, + "grad_norm": 3.2200498580932617, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720542208, + "loss": 0.3, + "grad_norm": 3.212144374847412, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720540672, + "loss": 0.4279, + "grad_norm": 3.173780918121338, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720554496, + "loss": 0.3989, + "grad_norm": 3.441620111465454, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720525312, + "loss": 0.3532, + "grad_norm": 2.8896396160125732, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720576, + "loss": 0.4176, + "grad_norm": 3.6116034984588623, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720572928, + "loss": 0.4385, + "grad_norm": 4.021176338195801, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.85311232, + "gpu_mem": 4.72055296, + "loss": 0.3559, + "grad_norm": 3.797605276107788, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720534528, + "loss": 0.3643, + "grad_norm": 2.836670398712158, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720543744, + "loss": 0.2892, + "grad_norm": 3.29675030708313, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720511488, + "loss": 0.3781, + "grad_norm": 4.689011573791504, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720572928, + "loss": 0.4726, + "grad_norm": 3.461007595062256, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.85311232, + "gpu_mem": 4.720571392, + "loss": 0.3344, + "grad_norm": 3.677208662033081, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720526848, + "loss": 0.3356, + "grad_norm": 3.169806480407715, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720559104, + "loss": 0.355, + "grad_norm": 3.2972850799560547, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.853308928, + "gpu_mem": 4.72055296, + "loss": 0.3892, + "grad_norm": 4.131161212921143, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720539136, + "loss": 0.2837, + "grad_norm": 3.3345999717712402, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720539136, + "loss": 0.4005, + "grad_norm": 3.9133825302124023, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720565248, + "loss": 0.2771, + "grad_norm": 2.632892370223999, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.853308928, + "gpu_mem": 4.72055296, + "loss": 0.3006, + "grad_norm": 2.674530267715454, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720543744, + "loss": 0.4118, + "grad_norm": 3.2508935928344727, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720536064, + "loss": 0.5555, + "grad_norm": 4.437479496002197, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720559104, + "loss": 0.253, + "grad_norm": 2.209259510040283, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720548352, + "loss": 0.2036, + "grad_norm": 2.3002851009368896, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720532992, + "loss": 0.3645, + "grad_norm": 2.9848732948303223, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.853308928, + "gpu_mem": 4.72059904, + "loss": 0.274, + "grad_norm": 2.5904791355133057, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720542208, + "loss": 0.3894, + "grad_norm": 3.8763275146484375, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720528384, + "loss": 0.3652, + "grad_norm": 3.627070665359497, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720592896, + "loss": 0.3991, + "grad_norm": 3.751986503601074, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.853308928, + "gpu_mem": 4.72052224, + "loss": 0.3849, + "grad_norm": 3.4553236961364746, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720549888, + "loss": 0.1949, + "grad_norm": 2.814553737640381, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.853308928, + "gpu_mem": 4.72055296, + "loss": 0.4099, + "grad_norm": 3.659895181655884, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720528384, + "loss": 0.4299, + "grad_norm": 4.018091678619385, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720556032, + "loss": 0.3338, + "grad_norm": 3.5001885890960693, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720565248, + "loss": 0.253, + "grad_norm": 3.3059349060058594, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720562176, + "loss": 0.3919, + "grad_norm": 3.0028076171875, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720534528, + "loss": 0.3865, + "grad_norm": 3.1815884113311768, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720281088, + "loss": 0.25, + "grad_norm": 3.517228126525879, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.853308928, + "gpu_mem": 4.720281088, + "train_runtime": 8289.3811, + "train_samples_per_second": 4.814, + "train_steps_per_second": 0.075, + "total_flos": 8.629042704352051e+16, + "train_loss": 0.6983747581402079 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..616e0cc3677d4646846654f1887fbef4d57d10ca --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e31e885eafd3b34669c1476a6bf1ee9c963519e8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.8505277833100976 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..c3af3303e530544f1e8d96d4e351720a087a40da --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-hellaswag-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T01:38:21.710158" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..9533c6165b1e483bf5f717e2f3f6f9af65b97ff3 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-hellaswag-r8-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 1.712644096, + "gpu_mem": 4.44299008, + "loss": 3.4877, + "grad_norm": 23.0362548828125, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 1.718935552, + "gpu_mem": 4.493446144, + "loss": 3.6203, + "grad_norm": 22.6283016204834, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 1.7201152, + "gpu_mem": 4.493453824, + "loss": 3.381, + "grad_norm": 22.348154067993164, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 1.72109824, + "gpu_mem": 4.493487616, + "loss": 3.4752, + "grad_norm": 21.149770736694336, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 1.72208128, + "gpu_mem": 4.493450752, + "loss": 3.2376, + "grad_norm": 20.688838958740234, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 1.72306432, + "gpu_mem": 4.493496832, + "loss": 3.1329, + "grad_norm": 21.438047409057617, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 1.723850752, + "gpu_mem": 4.493456896, + "loss": 2.9243, + "grad_norm": 17.35895538330078, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 1.724833792, + "gpu_mem": 4.493487616, + "loss": 2.4011, + "grad_norm": 15.75527286529541, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 1.725816832, + "gpu_mem": 4.493487616, + "loss": 2.1931, + "grad_norm": 10.843381881713867, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 1.728765952, + "gpu_mem": 4.493430784, + "loss": 1.9729, + "grad_norm": 7.88395881652832, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 1.731518464, + "gpu_mem": 4.493450752, + "loss": 1.6712, + "grad_norm": 6.300374507904053, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 1.732304896, + "gpu_mem": 4.49344768, + "loss": 1.6771, + "grad_norm": 5.766356945037842, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 1.733091328, + "gpu_mem": 4.49344, + "loss": 1.4968, + "grad_norm": 3.0858969688415527, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 1.73387776, + "gpu_mem": 4.493466112, + "loss": 1.4421, + "grad_norm": 1.9313714504241943, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 1.734467584, + "gpu_mem": 4.493464576, + "loss": 1.4403, + "grad_norm": 2.833580255508423, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 1.735057408, + "gpu_mem": 4.493456896, + "loss": 1.4191, + "grad_norm": 1.8312991857528687, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 1.733681152, + "gpu_mem": 4.493456896, + "loss": 1.4589, + "grad_norm": 3.105937957763672, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 1.734467584, + "gpu_mem": 4.493456896, + "loss": 1.3775, + "grad_norm": 2.7767298221588135, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 1.735254016, + "gpu_mem": 4.493456896, + "loss": 1.4605, + "grad_norm": 1.9506347179412842, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 1.73584384, + "gpu_mem": 4.493430784, + "loss": 1.5004, + "grad_norm": 3.6380796432495117, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 1.736433664, + "gpu_mem": 4.49344768, + "loss": 1.4937, + "grad_norm": 3.4064600467681885, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 1.737023488, + "gpu_mem": 4.49345536, + "loss": 1.4269, + "grad_norm": 2.6639764308929443, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 1.737613312, + "gpu_mem": 4.493469184, + "loss": 1.4179, + "grad_norm": 2.175226926803589, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 1.738203136, + "gpu_mem": 4.493453824, + "loss": 1.3789, + "grad_norm": 2.3146252632141113, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 1.74075904, + "gpu_mem": 4.493441536, + "loss": 1.4951, + "grad_norm": 4.642297267913818, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 1.741348864, + "gpu_mem": 4.49344768, + "loss": 1.4944, + "grad_norm": 4.725379943847656, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 1.742135296, + "gpu_mem": 4.49345536, + "loss": 1.4078, + "grad_norm": 2.075615882873535, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 1.742528512, + "gpu_mem": 4.493450752, + "loss": 1.4117, + "grad_norm": 1.697830319404602, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 1.743118336, + "gpu_mem": 4.493459968, + "loss": 1.4295, + "grad_norm": 2.0823287963867188, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 1.744101376, + "gpu_mem": 4.49343232, + "loss": 1.465, + "grad_norm": 2.519767999649048, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 1.7446912, + "gpu_mem": 4.493487616, + "loss": 1.4028, + "grad_norm": 1.1726704835891724, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 1.745281024, + "gpu_mem": 4.493479936, + "loss": 1.428, + "grad_norm": 1.4406405687332153, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 1.745870848, + "gpu_mem": 4.493433856, + "loss": 1.3769, + "grad_norm": 0.412997841835022, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 1.746460672, + "gpu_mem": 4.493452288, + "loss": 1.4108, + "grad_norm": 0.7952141761779785, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 1.747050496, + "gpu_mem": 4.493473792, + "loss": 1.4619, + "grad_norm": 2.281728982925415, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 1.74764032, + "gpu_mem": 4.493472256, + "loss": 1.4107, + "grad_norm": 0.9904084801673889, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 1.748230144, + "gpu_mem": 4.493504512, + "loss": 1.3992, + "grad_norm": 0.7199559807777405, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 1.748819968, + "gpu_mem": 4.493456896, + "loss": 1.4032, + "grad_norm": 0.6078174710273743, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 1.749409792, + "gpu_mem": 4.493513728, + "loss": 1.3662, + "grad_norm": 1.6473122835159302, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 1.749999616, + "gpu_mem": 4.493441536, + "loss": 1.472, + "grad_norm": 1.7448725700378418, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 1.75058944, + "gpu_mem": 4.493469184, + "loss": 1.3866, + "grad_norm": 0.9719277024269104, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 1.750982656, + "gpu_mem": 4.493483008, + "loss": 1.4318, + "grad_norm": 1.621679425239563, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 1.751769088, + "gpu_mem": 4.493489152, + "loss": 1.3784, + "grad_norm": 0.571748673915863, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 1.752162304, + "gpu_mem": 4.493467648, + "loss": 1.403, + "grad_norm": 0.9391312003135681, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 1.75255552, + "gpu_mem": 4.493467648, + "loss": 1.3911, + "grad_norm": 0.41856327652931213, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 1.753145344, + "gpu_mem": 4.493467648, + "loss": 1.3776, + "grad_norm": 1.3814259767532349, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 1.753735168, + "gpu_mem": 4.493453824, + "loss": 1.393, + "grad_norm": 0.6978099942207336, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 1.754324992, + "gpu_mem": 4.493472256, + "loss": 1.354, + "grad_norm": 0.43229755759239197, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 1.754718208, + "gpu_mem": 4.493484544, + "loss": 1.4351, + "grad_norm": 1.297333836555481, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 1.755111424, + "gpu_mem": 4.493461504, + "loss": 1.3853, + "grad_norm": 0.747490406036377, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 1.755701248, + "gpu_mem": 4.493446144, + "loss": 1.3845, + "grad_norm": 0.8844747543334961, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 1.756291072, + "gpu_mem": 4.493450752, + "loss": 1.3707, + "grad_norm": 0.5806766748428345, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 1.756684288, + "gpu_mem": 4.4934784, + "loss": 1.3924, + "grad_norm": 0.8298816680908203, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 1.757077504, + "gpu_mem": 4.493453824, + "loss": 1.4437, + "grad_norm": 1.3472962379455566, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 1.75747072, + "gpu_mem": 4.493472256, + "loss": 1.3872, + "grad_norm": 0.6884870529174805, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 1.758060544, + "gpu_mem": 4.493466112, + "loss": 1.3797, + "grad_norm": 0.6756719350814819, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 1.758650368, + "gpu_mem": 4.49343232, + "loss": 1.4081, + "grad_norm": 0.9279078245162964, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 1.759043584, + "gpu_mem": 4.493461504, + "loss": 1.4189, + "grad_norm": 1.1654587984085083, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 1.759633408, + "gpu_mem": 4.493444608, + "loss": 1.3376, + "grad_norm": 0.8263879418373108, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 1.760026624, + "gpu_mem": 4.49348608, + "loss": 1.415, + "grad_norm": 1.8450270891189575, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 1.76041984, + "gpu_mem": 4.493452288, + "loss": 1.4064, + "grad_norm": 0.9668666124343872, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 1.761009664, + "gpu_mem": 4.493492224, + "loss": 1.3418, + "grad_norm": 1.1964612007141113, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 1.76140288, + "gpu_mem": 4.493446144, + "loss": 1.4404, + "grad_norm": 1.4596117734909058, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 1.761796096, + "gpu_mem": 4.493450752, + "loss": 1.4426, + "grad_norm": 1.5563303232192993, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 1.762189312, + "gpu_mem": 4.49344768, + "loss": 1.4109, + "grad_norm": 1.1105663776397705, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 1.762779136, + "gpu_mem": 4.493466112, + "loss": 1.3996, + "grad_norm": 0.6154927611351013, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 1.763172352, + "gpu_mem": 4.493458432, + "loss": 1.3961, + "grad_norm": 0.6701638698577881, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 1.763762176, + "gpu_mem": 4.493443072, + "loss": 1.4251, + "grad_norm": 1.3010475635528564, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 1.764155392, + "gpu_mem": 4.493513728, + "loss": 1.3944, + "grad_norm": 1.275406002998352, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 1.764745216, + "gpu_mem": 4.493464576, + "loss": 1.4261, + "grad_norm": 1.60918128490448, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 1.765138432, + "gpu_mem": 4.493489152, + "loss": 1.3588, + "grad_norm": 1.3156273365020752, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 1.765531648, + "gpu_mem": 4.493459968, + "loss": 1.4264, + "grad_norm": 1.9019278287887573, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 1.765924864, + "gpu_mem": 4.493452288, + "loss": 1.5391, + "grad_norm": 3.9599862098693848, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 1.76631808, + "gpu_mem": 4.493446144, + "loss": 1.4125, + "grad_norm": 1.8825929164886475, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 1.766711296, + "gpu_mem": 4.493475328, + "loss": 1.4699, + "grad_norm": 2.4831039905548096, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 1.767104512, + "gpu_mem": 4.493466112, + "loss": 1.4897, + "grad_norm": 3.025399923324585, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 1.767497728, + "gpu_mem": 4.493453824, + "loss": 1.4019, + "grad_norm": 0.9176809787750244, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 1.768087552, + "gpu_mem": 4.493446144, + "loss": 1.4079, + "grad_norm": 1.2582684755325317, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 1.768480768, + "gpu_mem": 4.493498368, + "loss": 1.3926, + "grad_norm": 0.9529223442077637, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 1.768873984, + "gpu_mem": 4.493476864, + "loss": 1.4195, + "grad_norm": 1.045619249343872, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 1.7692672, + "gpu_mem": 4.49347072, + "loss": 1.358, + "grad_norm": 0.33427485823631287, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 1.769857024, + "gpu_mem": 4.49344768, + "loss": 1.3784, + "grad_norm": 0.8806460499763489, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 1.77025024, + "gpu_mem": 4.493469184, + "loss": 1.4503, + "grad_norm": 1.5493355989456177, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 1.770643456, + "gpu_mem": 4.493441536, + "loss": 1.4337, + "grad_norm": 1.2922650575637817, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 1.770840064, + "gpu_mem": 4.493449216, + "loss": 1.3933, + "grad_norm": 0.7838622331619263, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 1.77123328, + "gpu_mem": 4.493467648, + "loss": 1.4014, + "grad_norm": 0.9261447787284851, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 1.771823104, + "gpu_mem": 4.493456896, + "loss": 1.3839, + "grad_norm": 0.3987230956554413, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 1.772019712, + "gpu_mem": 4.49345536, + "loss": 1.3856, + "grad_norm": 0.8156496286392212, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 1.772412928, + "gpu_mem": 4.493450752, + "loss": 1.4326, + "grad_norm": 1.1973363161087036, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 1.772806144, + "gpu_mem": 4.49345536, + "loss": 1.3864, + "grad_norm": 0.7243323922157288, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 1.77319936, + "gpu_mem": 4.493466112, + "loss": 1.3901, + "grad_norm": 0.8156611919403076, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 1.773592576, + "gpu_mem": 4.493469184, + "loss": 1.3958, + "grad_norm": 0.7211673259735107, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 1.773985792, + "gpu_mem": 4.493469184, + "loss": 1.4037, + "grad_norm": 0.38165047764778137, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 1.774379008, + "gpu_mem": 4.493464576, + "loss": 1.3814, + "grad_norm": 0.41160324215888977, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 1.774772224, + "gpu_mem": 4.493483008, + "loss": 1.4203, + "grad_norm": 1.3068056106567383, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 1.77516544, + "gpu_mem": 4.49348608, + "loss": 1.3781, + "grad_norm": 0.4024551510810852, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 1.775558656, + "gpu_mem": 4.49346304, + "loss": 1.402, + "grad_norm": 0.4701593816280365, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 1.775951872, + "gpu_mem": 4.493473792, + "loss": 1.409, + "grad_norm": 0.5078145265579224, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 1.776345088, + "gpu_mem": 4.493473792, + "loss": 1.3913, + "grad_norm": 0.7577505707740784, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 1.776738304, + "gpu_mem": 4.493449216, + "loss": 1.3895, + "grad_norm": 0.3730054199695587, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 1.77713152, + "gpu_mem": 4.4934784, + "loss": 1.4001, + "grad_norm": 0.7956017255783081, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 1.777328128, + "gpu_mem": 4.49345536, + "loss": 1.368, + "grad_norm": 0.48397096991539, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 1.777721344, + "gpu_mem": 4.493472256, + "loss": 1.4109, + "grad_norm": 0.6740492582321167, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 1.777917952, + "gpu_mem": 4.49344, + "loss": 1.413, + "grad_norm": 0.9631777405738831, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 1.778311168, + "gpu_mem": 4.49345536, + "loss": 1.3809, + "grad_norm": 0.44344624876976013, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 1.778507776, + "gpu_mem": 4.493435392, + "loss": 1.4578, + "grad_norm": 1.666815161705017, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 1.778900992, + "gpu_mem": 4.493476864, + "loss": 1.3861, + "grad_norm": 0.6710153222084045, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 1.779294208, + "gpu_mem": 4.493472256, + "loss": 1.3997, + "grad_norm": 0.5651688575744629, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 1.779687424, + "gpu_mem": 4.4934784, + "loss": 1.403, + "grad_norm": 1.0365393161773682, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 1.78008064, + "gpu_mem": 4.493475328, + "loss": 1.377, + "grad_norm": 0.5423694849014282, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 1.780277248, + "gpu_mem": 4.493476864, + "loss": 1.4137, + "grad_norm": 1.3744028806686401, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 1.780670464, + "gpu_mem": 4.493473792, + "loss": 1.3699, + "grad_norm": 0.5367031097412109, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 1.78106368, + "gpu_mem": 4.493453824, + "loss": 1.4096, + "grad_norm": 0.6898807287216187, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 1.781456896, + "gpu_mem": 4.493449216, + "loss": 1.3577, + "grad_norm": 0.5536346435546875, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 1.781850112, + "gpu_mem": 4.493467648, + "loss": 1.4198, + "grad_norm": 0.6543102860450745, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 1.782243328, + "gpu_mem": 4.4934784, + "loss": 1.4092, + "grad_norm": 0.5865402817726135, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 1.782439936, + "gpu_mem": 4.493464576, + "loss": 1.3748, + "grad_norm": 0.2984783351421356, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 1.782833152, + "gpu_mem": 4.493479936, + "loss": 1.3684, + "grad_norm": 0.8238226771354675, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 1.783226368, + "gpu_mem": 4.493461504, + "loss": 1.4213, + "grad_norm": 1.4312185049057007, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 1.783422976, + "gpu_mem": 4.493487616, + "loss": 1.4098, + "grad_norm": 0.8734499216079712, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 1.783619584, + "gpu_mem": 4.493446144, + "loss": 1.3758, + "grad_norm": 0.5772071480751038, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 1.7840128, + "gpu_mem": 4.4934784, + "loss": 1.4137, + "grad_norm": 1.0213813781738281, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 1.784406016, + "gpu_mem": 4.493472256, + "loss": 1.4068, + "grad_norm": 0.5098854899406433, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 1.784799232, + "gpu_mem": 4.493473792, + "loss": 1.3695, + "grad_norm": 0.5449486970901489, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 1.785192448, + "gpu_mem": 4.493449216, + "loss": 1.3583, + "grad_norm": 0.4825524687767029, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 1.785389056, + "gpu_mem": 4.493458432, + "loss": 1.3526, + "grad_norm": 0.4110080599784851, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 1.785585664, + "gpu_mem": 4.493444608, + "loss": 1.4206, + "grad_norm": 1.0523475408554077, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 1.78597888, + "gpu_mem": 4.493481472, + "loss": 1.3629, + "grad_norm": 0.4089713990688324, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 1.786372096, + "gpu_mem": 4.4934784, + "loss": 1.3865, + "grad_norm": 0.3714219629764557, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 1.786568704, + "gpu_mem": 4.4934784, + "loss": 1.3994, + "grad_norm": 0.5283492803573608, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 1.78696192, + "gpu_mem": 4.493467648, + "loss": 1.3839, + "grad_norm": 0.47119033336639404, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 1.787355136, + "gpu_mem": 4.493467648, + "loss": 1.3501, + "grad_norm": 0.6966690421104431, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 1.787551744, + "gpu_mem": 4.493449216, + "loss": 1.3886, + "grad_norm": 0.5709307193756104, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 1.78794496, + "gpu_mem": 4.493459968, + "loss": 1.3799, + "grad_norm": 0.664378821849823, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 1.788141568, + "gpu_mem": 4.493469184, + "loss": 1.3524, + "grad_norm": 0.4631108045578003, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 1.788534784, + "gpu_mem": 4.493484544, + "loss": 1.3895, + "grad_norm": 1.1079460382461548, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 1.788731392, + "gpu_mem": 4.49343232, + "loss": 1.3536, + "grad_norm": 0.7550533413887024, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 1.789124608, + "gpu_mem": 4.493452288, + "loss": 1.3699, + "grad_norm": 1.0372254848480225, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 1.789321216, + "gpu_mem": 4.493433856, + "loss": 1.3393, + "grad_norm": 1.2427090406417847, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 1.789714432, + "gpu_mem": 4.493450752, + "loss": 1.2868, + "grad_norm": 1.023554801940918, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 1.78991104, + "gpu_mem": 4.493456896, + "loss": 1.3675, + "grad_norm": 2.991377830505371, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 1.790304256, + "gpu_mem": 4.493453824, + "loss": 1.2417, + "grad_norm": 1.8879321813583374, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 1.790500864, + "gpu_mem": 4.493479936, + "loss": 1.261, + "grad_norm": 2.495176315307617, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 1.79089408, + "gpu_mem": 4.493453824, + "loss": 1.2151, + "grad_norm": 1.8553787469863892, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 1.791090688, + "gpu_mem": 4.49349376, + "loss": 1.3457, + "grad_norm": 3.467176675796509, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 1.791287296, + "gpu_mem": 4.493443072, + "loss": 1.2625, + "grad_norm": 3.3499250411987305, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 1.791680512, + "gpu_mem": 4.493452288, + "loss": 1.3294, + "grad_norm": 9.814748764038086, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 1.79187712, + "gpu_mem": 4.493472256, + "loss": 1.3121, + "grad_norm": 2.880732536315918, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 1.792073728, + "gpu_mem": 4.49346304, + "loss": 1.2123, + "grad_norm": 2.7824184894561768, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 1.792466944, + "gpu_mem": 4.493475328, + "loss": 1.1706, + "grad_norm": 2.358233690261841, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 1.79286016, + "gpu_mem": 4.49344, + "loss": 1.2043, + "grad_norm": 2.1822941303253174, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 1.793056768, + "gpu_mem": 4.49347072, + "loss": 1.1004, + "grad_norm": 2.6122193336486816, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 1.793253376, + "gpu_mem": 4.493466112, + "loss": 1.2481, + "grad_norm": 4.46557092666626, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 1.793449984, + "gpu_mem": 4.493490688, + "loss": 1.3233, + "grad_norm": 8.05833911895752, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 1.793646592, + "gpu_mem": 4.493427712, + "loss": 1.1171, + "grad_norm": 4.3249359130859375, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 1.7938432, + "gpu_mem": 4.493481472, + "loss": 1.107, + "grad_norm": 3.499655246734619, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 1.794236416, + "gpu_mem": 4.493473792, + "loss": 1.2434, + "grad_norm": 3.9885709285736084, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 1.794433024, + "gpu_mem": 4.493489152, + "loss": 1.0871, + "grad_norm": 3.815213441848755, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 1.79482624, + "gpu_mem": 4.493490688, + "loss": 1.0313, + "grad_norm": 2.4548869132995605, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 1.795022848, + "gpu_mem": 4.493458432, + "loss": 0.9887, + "grad_norm": 3.300377607345581, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 1.795219456, + "gpu_mem": 4.493450752, + "loss": 0.9345, + "grad_norm": 3.2422165870666504, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 1.795416064, + "gpu_mem": 4.49348608, + "loss": 0.9134, + "grad_norm": 3.1416478157043457, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 1.79580928, + "gpu_mem": 4.493433856, + "loss": 0.9926, + "grad_norm": 3.0668585300445557, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 1.796005888, + "gpu_mem": 4.493469184, + "loss": 1.004, + "grad_norm": 3.3945112228393555, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 1.796399104, + "gpu_mem": 4.493458432, + "loss": 1.0683, + "grad_norm": 3.304023504257202, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 1.796595712, + "gpu_mem": 4.493490688, + "loss": 0.8442, + "grad_norm": 4.041101932525635, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 1.79679232, + "gpu_mem": 4.493484544, + "loss": 0.9793, + "grad_norm": 3.991379499435425, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 1.797185536, + "gpu_mem": 4.49346304, + "loss": 0.9983, + "grad_norm": 8.161965370178223, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 1.797382144, + "gpu_mem": 4.493459968, + "loss": 1.2306, + "grad_norm": 7.912356853485107, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 1.797578752, + "gpu_mem": 4.493496832, + "loss": 0.7977, + "grad_norm": 4.782266616821289, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 1.797971968, + "gpu_mem": 4.49347072, + "loss": 0.7059, + "grad_norm": 5.634410381317139, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 1.798168576, + "gpu_mem": 4.49344768, + "loss": 0.9454, + "grad_norm": 5.690829277038574, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 1.798365184, + "gpu_mem": 4.493473792, + "loss": 0.8284, + "grad_norm": 3.690347909927368, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 1.798561792, + "gpu_mem": 4.49348608, + "loss": 0.9328, + "grad_norm": 4.540343761444092, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 1.7987584, + "gpu_mem": 4.49344768, + "loss": 0.7905, + "grad_norm": 3.626569986343384, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 1.798955008, + "gpu_mem": 4.493456896, + "loss": 0.7974, + "grad_norm": 3.1121065616607666, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 1.799151616, + "gpu_mem": 4.493438464, + "loss": 0.8573, + "grad_norm": 3.970581293106079, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 1.799348224, + "gpu_mem": 4.493452288, + "loss": 0.6121, + "grad_norm": 3.308837413787842, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 1.79974144, + "gpu_mem": 4.493459968, + "loss": 0.7119, + "grad_norm": 4.1121506690979, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 1.799938048, + "gpu_mem": 4.493441536, + "loss": 0.8021, + "grad_norm": 3.921325922012329, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 1.800331264, + "gpu_mem": 4.493472256, + "loss": 0.7717, + "grad_norm": 5.189688682556152, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 1.800331264, + "gpu_mem": 4.493443072, + "loss": 0.8691, + "grad_norm": 6.064367771148682, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 1.80072448, + "gpu_mem": 4.493467648, + "loss": 0.7765, + "grad_norm": 4.030826568603516, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 1.800921088, + "gpu_mem": 4.49344768, + "loss": 0.5981, + "grad_norm": 3.938490867614746, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 1.801117696, + "gpu_mem": 4.493479936, + "loss": 0.9764, + "grad_norm": 7.410391330718994, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 1.801314304, + "gpu_mem": 4.49347072, + "loss": 0.8313, + "grad_norm": 5.120713233947754, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 1.801510912, + "gpu_mem": 4.493466112, + "loss": 0.7793, + "grad_norm": 3.0864500999450684, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 1.80170752, + "gpu_mem": 4.493423104, + "loss": 0.8532, + "grad_norm": 4.613983154296875, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 1.801904128, + "gpu_mem": 4.493502976, + "loss": 0.8504, + "grad_norm": 4.65530252456665, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 1.802100736, + "gpu_mem": 4.493453824, + "loss": 0.6937, + "grad_norm": 4.396862030029297, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 1.802297344, + "gpu_mem": 4.493453824, + "loss": 0.6667, + "grad_norm": 2.971975088119507, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 1.80269056, + "gpu_mem": 4.493420032, + "loss": 0.8256, + "grad_norm": 4.682041168212891, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 1.802887168, + "gpu_mem": 4.493459968, + "loss": 0.3789, + "grad_norm": 2.8076529502868652, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 1.803083776, + "gpu_mem": 4.49345536, + "loss": 0.6972, + "grad_norm": 5.483014106750488, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 1.803280384, + "gpu_mem": 4.493443072, + "loss": 0.915, + "grad_norm": 5.462584972381592, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 1.803476992, + "gpu_mem": 4.493467648, + "loss": 0.6465, + "grad_norm": 4.632084846496582, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 1.8036736, + "gpu_mem": 4.493483008, + "loss": 0.7491, + "grad_norm": 4.3185601234436035, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 1.803870208, + "gpu_mem": 4.49345536, + "loss": 0.5112, + "grad_norm": 4.800030708312988, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 1.804066816, + "gpu_mem": 4.493452288, + "loss": 1.0307, + "grad_norm": 8.838525772094727, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 1.804263424, + "gpu_mem": 4.493467648, + "loss": 0.8809, + "grad_norm": 6.483818531036377, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 1.804263424, + "gpu_mem": 4.493444608, + "loss": 0.622, + "grad_norm": 3.3558332920074463, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 1.804460032, + "gpu_mem": 4.493446144, + "loss": 0.8104, + "grad_norm": 4.434282302856445, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 1.80465664, + "gpu_mem": 4.493487616, + "loss": 0.5998, + "grad_norm": 3.085745334625244, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 1.805049856, + "gpu_mem": 4.493458432, + "loss": 0.6255, + "grad_norm": 3.1109416484832764, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 1.805246464, + "gpu_mem": 4.493458432, + "loss": 0.754, + "grad_norm": 2.9910120964050293, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 1.805443072, + "gpu_mem": 4.49345536, + "loss": 0.5902, + "grad_norm": 2.724012613296509, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 1.80563968, + "gpu_mem": 4.49345536, + "loss": 0.692, + "grad_norm": 3.7828917503356934, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 1.805836288, + "gpu_mem": 4.493446144, + "loss": 0.5959, + "grad_norm": 3.612858533859253, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 1.806032896, + "gpu_mem": 4.493481472, + "loss": 0.4879, + "grad_norm": 4.187531471252441, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 1.806229504, + "gpu_mem": 4.493438464, + "loss": 0.6323, + "grad_norm": 4.533961296081543, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 1.806426112, + "gpu_mem": 4.493466112, + "loss": 0.5257, + "grad_norm": 3.2837133407592773, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 1.80662272, + "gpu_mem": 4.493475328, + "loss": 0.5734, + "grad_norm": 3.948215961456299, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 1.806819328, + "gpu_mem": 4.49344768, + "loss": 0.599, + "grad_norm": 3.6628825664520264, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 1.807015936, + "gpu_mem": 4.493456896, + "loss": 0.5521, + "grad_norm": 3.8152072429656982, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 1.807212544, + "gpu_mem": 4.493458432, + "loss": 0.5121, + "grad_norm": 4.260671615600586, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 1.807409152, + "gpu_mem": 4.493458432, + "loss": 0.5342, + "grad_norm": 4.816490173339844, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 1.807802368, + "gpu_mem": 4.493443072, + "loss": 0.662, + "grad_norm": 9.128137588500977, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 1.807998976, + "gpu_mem": 4.493464576, + "loss": 0.5216, + "grad_norm": 5.2205281257629395, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 1.807998976, + "gpu_mem": 4.493498368, + "loss": 0.621, + "grad_norm": 4.133636474609375, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 1.808195584, + "gpu_mem": 4.493452288, + "loss": 0.6774, + "grad_norm": 4.034868240356445, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 1.808392192, + "gpu_mem": 4.493458432, + "loss": 0.6048, + "grad_norm": 4.173713684082031, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 1.8085888, + "gpu_mem": 4.493473792, + "loss": 0.639, + "grad_norm": 3.1333236694335938, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 1.8085888, + "gpu_mem": 4.493492224, + "loss": 0.4823, + "grad_norm": 2.6162989139556885, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 1.808785408, + "gpu_mem": 4.493461504, + "loss": 0.5941, + "grad_norm": 2.9334747791290283, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 1.808982016, + "gpu_mem": 4.49344768, + "loss": 0.4638, + "grad_norm": 2.7242014408111572, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 1.809178624, + "gpu_mem": 4.49344, + "loss": 0.4751, + "grad_norm": 3.0404505729675293, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 1.809375232, + "gpu_mem": 4.493504512, + "loss": 0.7004, + "grad_norm": 4.1775665283203125, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 1.80957184, + "gpu_mem": 4.493443072, + "loss": 0.5984, + "grad_norm": 3.6501922607421875, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 1.809768448, + "gpu_mem": 4.493495296, + "loss": 0.477, + "grad_norm": 3.2237954139709473, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 1.809965056, + "gpu_mem": 4.493476864, + "loss": 0.5657, + "grad_norm": 3.6300415992736816, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 1.810161664, + "gpu_mem": 4.493475328, + "loss": 0.5155, + "grad_norm": 3.782133102416992, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 1.810358272, + "gpu_mem": 4.493479936, + "loss": 0.686, + "grad_norm": 3.592907667160034, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 1.81055488, + "gpu_mem": 4.49345536, + "loss": 0.4307, + "grad_norm": 3.390540361404419, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 1.810751488, + "gpu_mem": 4.493484544, + "loss": 0.4918, + "grad_norm": 4.497286319732666, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 1.810948096, + "gpu_mem": 4.493461504, + "loss": 0.6267, + "grad_norm": 3.3237509727478027, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 1.810948096, + "gpu_mem": 4.493522944, + "loss": 0.5496, + "grad_norm": 4.5222063064575195, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 1.811144704, + "gpu_mem": 4.49344768, + "loss": 0.6535, + "grad_norm": 4.644615173339844, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 1.811341312, + "gpu_mem": 4.493458432, + "loss": 0.5741, + "grad_norm": 3.8454477787017822, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 1.81153792, + "gpu_mem": 4.493456896, + "loss": 0.7769, + "grad_norm": 4.515701770782471, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 1.811734528, + "gpu_mem": 4.493453824, + "loss": 0.7862, + "grad_norm": 3.821063995361328, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 1.811931136, + "gpu_mem": 4.493484544, + "loss": 0.4653, + "grad_norm": 2.609694480895996, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 1.812127744, + "gpu_mem": 4.49346304, + "loss": 0.5452, + "grad_norm": 3.252509593963623, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 1.812324352, + "gpu_mem": 4.493458432, + "loss": 0.5743, + "grad_norm": 3.831632614135742, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 1.81252096, + "gpu_mem": 4.493469184, + "loss": 0.5878, + "grad_norm": 2.5769033432006836, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 1.81252096, + "gpu_mem": 4.493473792, + "loss": 0.6471, + "grad_norm": 3.546938419342041, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 1.812717568, + "gpu_mem": 4.493435392, + "loss": 0.7321, + "grad_norm": 5.41407585144043, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 1.812914176, + "gpu_mem": 4.493502976, + "loss": 0.9202, + "grad_norm": 4.238180160522461, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 1.813110784, + "gpu_mem": 4.493466112, + "loss": 0.7082, + "grad_norm": 2.8106820583343506, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 1.813307392, + "gpu_mem": 4.49345536, + "loss": 0.5994, + "grad_norm": 2.9212875366210938, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 1.813504, + "gpu_mem": 4.493472256, + "loss": 0.5972, + "grad_norm": 2.083442449569702, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 1.813504, + "gpu_mem": 4.493446144, + "loss": 0.5049, + "grad_norm": 2.7087483406066895, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 1.813700608, + "gpu_mem": 4.49349376, + "loss": 0.5216, + "grad_norm": 2.6150035858154297, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 1.813897216, + "gpu_mem": 4.493461504, + "loss": 0.534, + "grad_norm": 4.001352310180664, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 1.814093824, + "gpu_mem": 4.493450752, + "loss": 0.4006, + "grad_norm": 3.137725353240967, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 1.814290432, + "gpu_mem": 4.493466112, + "loss": 0.5986, + "grad_norm": 4.0121588706970215, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 1.81448704, + "gpu_mem": 4.49346304, + "loss": 0.5056, + "grad_norm": 3.6183862686157227, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 1.81448704, + "gpu_mem": 4.49346304, + "loss": 0.5011, + "grad_norm": 3.6647024154663086, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 1.814683648, + "gpu_mem": 4.493450752, + "loss": 0.6292, + "grad_norm": 5.131507873535156, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 1.814880256, + "gpu_mem": 4.493433856, + "loss": 0.8643, + "grad_norm": 6.568894386291504, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 1.814880256, + "gpu_mem": 4.493496832, + "loss": 0.674, + "grad_norm": 4.723886489868164, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 1.815076864, + "gpu_mem": 4.493450752, + "loss": 0.4326, + "grad_norm": 3.94960355758667, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 1.815273472, + "gpu_mem": 4.493459968, + "loss": 0.4169, + "grad_norm": 3.4277162551879883, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 1.81547008, + "gpu_mem": 4.493495296, + "loss": 0.3334, + "grad_norm": 3.1267659664154053, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 1.81547008, + "gpu_mem": 4.493459968, + "loss": 0.5617, + "grad_norm": 4.216224193572998, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 1.815666688, + "gpu_mem": 4.493464576, + "loss": 0.5289, + "grad_norm": 3.634931802749634, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 1.815863296, + "gpu_mem": 4.493512192, + "loss": 0.4214, + "grad_norm": 3.024141788482666, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 1.815863296, + "gpu_mem": 4.493521408, + "loss": 0.4139, + "grad_norm": 2.636969566345215, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 1.816059904, + "gpu_mem": 4.493475328, + "loss": 0.2556, + "grad_norm": 2.8076696395874023, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 1.816256512, + "gpu_mem": 4.493469184, + "loss": 0.5734, + "grad_norm": 4.1219987869262695, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 1.81645312, + "gpu_mem": 4.493530624, + "loss": 0.5319, + "grad_norm": 4.042218208312988, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 1.816649728, + "gpu_mem": 4.493456896, + "loss": 0.4935, + "grad_norm": 4.582643985748291, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 1.816649728, + "gpu_mem": 4.49345536, + "loss": 0.5112, + "grad_norm": 4.324197292327881, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 1.816846336, + "gpu_mem": 4.493458432, + "loss": 0.4434, + "grad_norm": 2.8381097316741943, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 1.817042944, + "gpu_mem": 4.493444608, + "loss": 0.5322, + "grad_norm": 3.631680965423584, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 1.817042944, + "gpu_mem": 4.493459968, + "loss": 0.361, + "grad_norm": 2.9633803367614746, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 1.817239552, + "gpu_mem": 4.493498368, + "loss": 0.477, + "grad_norm": 2.724745988845825, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 1.81743616, + "gpu_mem": 4.4934784, + "loss": 0.6035, + "grad_norm": 3.5153865814208984, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 1.81743616, + "gpu_mem": 4.493504512, + "loss": 0.5506, + "grad_norm": 3.6300365924835205, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 1.817632768, + "gpu_mem": 4.49345536, + "loss": 0.4343, + "grad_norm": 3.3929617404937744, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 1.817829376, + "gpu_mem": 4.493449216, + "loss": 0.5972, + "grad_norm": 3.6123995780944824, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 1.817829376, + "gpu_mem": 4.493472256, + "loss": 0.4127, + "grad_norm": 2.8799502849578857, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 1.817829376, + "gpu_mem": 4.493450752, + "loss": 0.5669, + "grad_norm": 3.2484076023101807, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 1.818025984, + "gpu_mem": 4.493464576, + "loss": 0.6393, + "grad_norm": 3.319472074508667, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 1.818222592, + "gpu_mem": 4.493469184, + "loss": 0.4497, + "grad_norm": 3.215298891067505, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 1.818222592, + "gpu_mem": 4.493487616, + "loss": 0.6362, + "grad_norm": 4.6009392738342285, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 1.8184192, + "gpu_mem": 4.493458432, + "loss": 0.3996, + "grad_norm": 2.7812013626098633, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 1.818615808, + "gpu_mem": 4.49348608, + "loss": 0.7461, + "grad_norm": 3.4204180240631104, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 1.818812416, + "gpu_mem": 4.493467648, + "loss": 0.5264, + "grad_norm": 2.7722835540771484, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 1.818812416, + "gpu_mem": 4.49345536, + "loss": 0.5824, + "grad_norm": 3.284911632537842, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 1.819009024, + "gpu_mem": 4.493464576, + "loss": 0.5656, + "grad_norm": 3.3946127891540527, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 1.819009024, + "gpu_mem": 4.493461504, + "loss": 0.492, + "grad_norm": 2.629390239715576, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 1.819205632, + "gpu_mem": 4.493476864, + "loss": 0.505, + "grad_norm": 3.496201276779175, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 1.81940224, + "gpu_mem": 4.493484544, + "loss": 0.4502, + "grad_norm": 3.1925528049468994, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 1.819598848, + "gpu_mem": 4.493473792, + "loss": 0.508, + "grad_norm": 3.021618604660034, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 1.819598848, + "gpu_mem": 4.493458432, + "loss": 0.7236, + "grad_norm": 3.4038844108581543, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 1.819795456, + "gpu_mem": 4.493461504, + "loss": 0.465, + "grad_norm": 3.43562650680542, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 1.819795456, + "gpu_mem": 4.49345536, + "loss": 0.4448, + "grad_norm": 2.8673324584960938, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 1.819992064, + "gpu_mem": 4.493450752, + "loss": 0.804, + "grad_norm": 4.464987277984619, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 1.820188672, + "gpu_mem": 4.493472256, + "loss": 0.7079, + "grad_norm": 3.8186893463134766, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 1.820188672, + "gpu_mem": 4.493464576, + "loss": 0.611, + "grad_norm": 3.3747470378875732, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 1.820188672, + "gpu_mem": 4.493436928, + "loss": 0.6288, + "grad_norm": 3.518157482147217, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 1.820188672, + "gpu_mem": 4.493435392, + "loss": 0.4959, + "grad_norm": 2.176128387451172, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 1.82038528, + "gpu_mem": 4.493461504, + "loss": 0.6353, + "grad_norm": 3.4326539039611816, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 1.82038528, + "gpu_mem": 4.493444608, + "loss": 0.3789, + "grad_norm": 2.751147747039795, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 1.820581888, + "gpu_mem": 4.493475328, + "loss": 0.4878, + "grad_norm": 3.502075672149658, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 1.820581888, + "gpu_mem": 4.493458432, + "loss": 0.5232, + "grad_norm": 2.8718512058258057, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 1.820778496, + "gpu_mem": 4.493489152, + "loss": 0.4482, + "grad_norm": 2.5313315391540527, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 1.820975104, + "gpu_mem": 4.493456896, + "loss": 0.4657, + "grad_norm": 3.0432515144348145, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 1.821171712, + "gpu_mem": 4.493483008, + "loss": 0.4221, + "grad_norm": 2.923752784729004, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 1.821171712, + "gpu_mem": 4.493458432, + "loss": 0.4414, + "grad_norm": 3.205054521560669, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 1.821171712, + "gpu_mem": 4.493453824, + "loss": 0.5525, + "grad_norm": 4.117857933044434, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 1.82136832, + "gpu_mem": 4.493456896, + "loss": 0.3729, + "grad_norm": 3.563624382019043, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 1.82136832, + "gpu_mem": 4.493475328, + "loss": 0.5065, + "grad_norm": 3.7563633918762207, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 1.821564928, + "gpu_mem": 4.49345536, + "loss": 0.5609, + "grad_norm": 4.381458759307861, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 1.821564928, + "gpu_mem": 4.493459968, + "loss": 0.4328, + "grad_norm": 4.222438812255859, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 1.821761536, + "gpu_mem": 4.49345536, + "loss": 0.5902, + "grad_norm": 4.4066338539123535, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 1.821761536, + "gpu_mem": 4.49346304, + "loss": 0.2935, + "grad_norm": 2.052401065826416, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 1.821958144, + "gpu_mem": 4.493487616, + "loss": 0.4666, + "grad_norm": 3.059274911880493, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 1.821958144, + "gpu_mem": 4.493479936, + "loss": 0.4336, + "grad_norm": 3.4712181091308594, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 1.822154752, + "gpu_mem": 4.493481472, + "loss": 0.5101, + "grad_norm": 3.3642971515655518, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 1.822154752, + "gpu_mem": 4.493456896, + "loss": 0.4665, + "grad_norm": 2.9268720149993896, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 1.82235136, + "gpu_mem": 4.493458432, + "loss": 0.5765, + "grad_norm": 3.454725980758667, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 1.82235136, + "gpu_mem": 4.4934784, + "loss": 0.5082, + "grad_norm": 3.1189208030700684, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 1.82235136, + "gpu_mem": 4.493450752, + "loss": 0.571, + "grad_norm": 3.045919418334961, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 1.822547968, + "gpu_mem": 4.49346304, + "loss": 0.2733, + "grad_norm": 2.502483367919922, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 1.822547968, + "gpu_mem": 4.493472256, + "loss": 0.3851, + "grad_norm": 2.7716379165649414, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 1.822744576, + "gpu_mem": 4.493449216, + "loss": 0.5033, + "grad_norm": 3.882826089859009, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 1.822744576, + "gpu_mem": 4.493473792, + "loss": 0.546, + "grad_norm": 3.999988079071045, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 1.822941184, + "gpu_mem": 4.493473792, + "loss": 0.4575, + "grad_norm": 3.5052478313446045, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 1.822941184, + "gpu_mem": 4.493456896, + "loss": 0.4015, + "grad_norm": 2.9534714221954346, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 1.823137792, + "gpu_mem": 4.493453824, + "loss": 0.4553, + "grad_norm": 3.4622557163238525, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 1.823137792, + "gpu_mem": 4.493446144, + "loss": 0.4769, + "grad_norm": 4.04550838470459, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 1.8233344, + "gpu_mem": 4.493484544, + "loss": 0.4739, + "grad_norm": 3.891273021697998, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 1.8233344, + "gpu_mem": 4.493461504, + "loss": 0.9837, + "grad_norm": 5.031757354736328, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 1.823531008, + "gpu_mem": 4.493459968, + "loss": 0.5817, + "grad_norm": 4.34716796875, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 1.823531008, + "gpu_mem": 4.493476864, + "loss": 0.5035, + "grad_norm": 3.4535582065582275, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 1.823531008, + "gpu_mem": 4.493461504, + "loss": 0.4014, + "grad_norm": 2.400301933288574, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 1.823727616, + "gpu_mem": 4.493473792, + "loss": 0.4481, + "grad_norm": 2.571110486984253, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 1.823727616, + "gpu_mem": 4.49348608, + "loss": 0.3557, + "grad_norm": 2.0774357318878174, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 1.823727616, + "gpu_mem": 4.493461504, + "loss": 0.4274, + "grad_norm": 2.601247787475586, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 1.823727616, + "gpu_mem": 4.493506048, + "loss": 0.5722, + "grad_norm": 3.1604530811309814, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 1.823924224, + "gpu_mem": 4.493479936, + "loss": 0.5041, + "grad_norm": 2.560265064239502, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 1.824120832, + "gpu_mem": 4.493476864, + "loss": 0.5293, + "grad_norm": 2.7755417823791504, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 1.824120832, + "gpu_mem": 4.493458432, + "loss": 0.3297, + "grad_norm": 2.582627773284912, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 1.824120832, + "gpu_mem": 4.493464576, + "loss": 0.318, + "grad_norm": 2.2512402534484863, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 1.824120832, + "gpu_mem": 4.493433856, + "loss": 0.3793, + "grad_norm": 2.327022075653076, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 1.82431744, + "gpu_mem": 4.493498368, + "loss": 0.5056, + "grad_norm": 3.870664596557617, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 1.82431744, + "gpu_mem": 4.493452288, + "loss": 0.4802, + "grad_norm": 3.399979829788208, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 1.824514048, + "gpu_mem": 4.493446144, + "loss": 0.3703, + "grad_norm": 2.9057939052581787, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 1.824514048, + "gpu_mem": 4.49350144, + "loss": 0.2387, + "grad_norm": 1.9972052574157715, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 1.824710656, + "gpu_mem": 4.493467648, + "loss": 0.2386, + "grad_norm": 3.136535167694092, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 1.824710656, + "gpu_mem": 4.49345536, + "loss": 0.3655, + "grad_norm": 3.8938422203063965, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 1.824907264, + "gpu_mem": 4.493459968, + "loss": 0.4303, + "grad_norm": 3.872579574584961, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 1.824907264, + "gpu_mem": 4.49344, + "loss": 0.4864, + "grad_norm": 4.861522674560547, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 1.824907264, + "gpu_mem": 4.493464576, + "loss": 0.4038, + "grad_norm": 3.7588486671447754, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 1.825103872, + "gpu_mem": 4.493443072, + "loss": 0.392, + "grad_norm": 3.8998050689697266, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 1.82530048, + "gpu_mem": 4.493459968, + "loss": 0.288, + "grad_norm": 2.839799404144287, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 1.82530048, + "gpu_mem": 4.49342464, + "loss": 0.4502, + "grad_norm": 5.1613450050354, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 1.82530048, + "gpu_mem": 4.493456896, + "loss": 0.3924, + "grad_norm": 3.9701755046844482, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 1.82530048, + "gpu_mem": 4.493446144, + "loss": 0.3466, + "grad_norm": 4.235082149505615, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 1.825497088, + "gpu_mem": 4.493483008, + "loss": 0.6364, + "grad_norm": 6.123757362365723, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 1.825497088, + "gpu_mem": 4.493449216, + "loss": 0.4268, + "grad_norm": 3.608308792114258, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 1.825497088, + "gpu_mem": 4.493472256, + "loss": 0.4886, + "grad_norm": 3.748034715652466, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 1.825497088, + "gpu_mem": 4.493461504, + "loss": 0.5931, + "grad_norm": 4.76796293258667, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 1.825497088, + "gpu_mem": 4.493467648, + "loss": 0.3875, + "grad_norm": 3.2048704624176025, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 1.825693696, + "gpu_mem": 4.493461504, + "loss": 0.601, + "grad_norm": 4.261781215667725, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 1.825890304, + "gpu_mem": 4.493479936, + "loss": 0.6997, + "grad_norm": 4.269753456115723, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 1.825890304, + "gpu_mem": 4.49344, + "loss": 0.3723, + "grad_norm": 2.8096847534179688, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 1.825890304, + "gpu_mem": 4.493472256, + "loss": 0.6555, + "grad_norm": 3.234971523284912, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 1.825890304, + "gpu_mem": 4.493492224, + "loss": 0.5314, + "grad_norm": 2.4134883880615234, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 1.825890304, + "gpu_mem": 4.49348608, + "loss": 0.3486, + "grad_norm": 2.303971529006958, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 1.826086912, + "gpu_mem": 4.493449216, + "loss": 0.5116, + "grad_norm": 3.0030975341796875, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 1.826086912, + "gpu_mem": 4.493466112, + "loss": 0.3908, + "grad_norm": 2.627114772796631, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 1.82628352, + "gpu_mem": 4.493443072, + "loss": 0.562, + "grad_norm": 2.1029739379882812, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 1.82628352, + "gpu_mem": 4.493475328, + "loss": 0.5706, + "grad_norm": 3.909468173980713, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 1.82628352, + "gpu_mem": 4.49347072, + "loss": 0.4073, + "grad_norm": 2.4910640716552734, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 1.826480128, + "gpu_mem": 4.493479936, + "loss": 0.3914, + "grad_norm": 2.775496006011963, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 1.826480128, + "gpu_mem": 4.493453824, + "loss": 0.3635, + "grad_norm": 2.3998405933380127, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 1.826480128, + "gpu_mem": 4.493473792, + "loss": 0.4372, + "grad_norm": 2.5777652263641357, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 1.826480128, + "gpu_mem": 4.49344768, + "loss": 0.4755, + "grad_norm": 3.1278529167175293, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 1.826676736, + "gpu_mem": 4.493472256, + "loss": 0.5358, + "grad_norm": 3.656757354736328, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 1.826676736, + "gpu_mem": 4.493456896, + "loss": 0.43, + "grad_norm": 3.2139899730682373, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 1.826676736, + "gpu_mem": 4.493490688, + "loss": 0.4786, + "grad_norm": 2.9354476928710938, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 1.826873344, + "gpu_mem": 4.49347072, + "loss": 0.4186, + "grad_norm": 3.750666856765747, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 1.826873344, + "gpu_mem": 4.49345536, + "loss": 0.4401, + "grad_norm": 3.192333698272705, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 1.826873344, + "gpu_mem": 4.493490688, + "loss": 0.4686, + "grad_norm": 3.4924445152282715, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 1.826873344, + "gpu_mem": 4.493496832, + "loss": 0.3121, + "grad_norm": 2.9909143447875977, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 1.826873344, + "gpu_mem": 4.493459968, + "loss": 0.5696, + "grad_norm": 4.781128883361816, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 1.827069952, + "gpu_mem": 4.493438464, + "loss": 0.4669, + "grad_norm": 3.514629364013672, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 1.827069952, + "gpu_mem": 4.493490688, + "loss": 0.6827, + "grad_norm": 3.9706435203552246, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 1.827069952, + "gpu_mem": 4.493476864, + "loss": 0.2826, + "grad_norm": 2.5964889526367188, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 1.827069952, + "gpu_mem": 4.49347072, + "loss": 0.5545, + "grad_norm": 3.93677020072937, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 1.827069952, + "gpu_mem": 4.493476864, + "loss": 0.541, + "grad_norm": 3.3937196731567383, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 1.82726656, + "gpu_mem": 4.493453824, + "loss": 0.5299, + "grad_norm": 3.5091288089752197, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 1.82726656, + "gpu_mem": 4.493467648, + "loss": 0.2503, + "grad_norm": 2.0803916454315186, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 1.82726656, + "gpu_mem": 4.493467648, + "loss": 0.6433, + "grad_norm": 4.549248218536377, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 1.82726656, + "gpu_mem": 4.493436928, + "loss": 0.4028, + "grad_norm": 2.903439521789551, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 1.82726656, + "gpu_mem": 4.49347072, + "loss": 0.435, + "grad_norm": 3.167689561843872, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 1.827463168, + "gpu_mem": 4.493449216, + "loss": 0.4196, + "grad_norm": 2.8760015964508057, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 1.827463168, + "gpu_mem": 4.493456896, + "loss": 0.2974, + "grad_norm": 2.3836724758148193, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 1.827463168, + "gpu_mem": 4.493475328, + "loss": 0.3632, + "grad_norm": 2.3326659202575684, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 1.827659776, + "gpu_mem": 4.493443072, + "loss": 0.6165, + "grad_norm": 3.909919023513794, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 1.827659776, + "gpu_mem": 4.49344768, + "loss": 0.4154, + "grad_norm": 2.812187433242798, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 1.827856384, + "gpu_mem": 4.493443072, + "loss": 0.3668, + "grad_norm": 2.3467419147491455, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 1.827856384, + "gpu_mem": 4.493487616, + "loss": 0.5283, + "grad_norm": 3.39583683013916, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 1.827856384, + "gpu_mem": 4.49347072, + "loss": 0.3491, + "grad_norm": 2.6071410179138184, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 1.827856384, + "gpu_mem": 4.493459968, + "loss": 0.4416, + "grad_norm": 2.9197824001312256, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 1.828052992, + "gpu_mem": 4.493481472, + "loss": 0.3597, + "grad_norm": 2.5150833129882812, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 1.828052992, + "gpu_mem": 4.49344768, + "loss": 0.3385, + "grad_norm": 2.7419075965881348, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 1.828052992, + "gpu_mem": 4.49346304, + "loss": 0.3742, + "grad_norm": 2.982290029525757, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 1.828052992, + "gpu_mem": 4.49346304, + "loss": 0.4288, + "grad_norm": 2.561265230178833, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 1.8282496, + "gpu_mem": 4.493453824, + "loss": 0.3781, + "grad_norm": 2.9267983436584473, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 1.8282496, + "gpu_mem": 4.493464576, + "loss": 0.7037, + "grad_norm": 3.9113543033599854, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 1.8282496, + "gpu_mem": 4.493489152, + "loss": 0.3949, + "grad_norm": 2.9471325874328613, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 1.8282496, + "gpu_mem": 4.493441536, + "loss": 0.5919, + "grad_norm": 3.533827543258667, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 1.8282496, + "gpu_mem": 4.493476864, + "loss": 0.3822, + "grad_norm": 2.996516227722168, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 1.8282496, + "gpu_mem": 4.493438464, + "loss": 0.3893, + "grad_norm": 3.400080919265747, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 1.828446208, + "gpu_mem": 4.493456896, + "loss": 0.4464, + "grad_norm": 3.034489393234253, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 1.828446208, + "gpu_mem": 4.493449216, + "loss": 0.3819, + "grad_norm": 2.7342591285705566, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 1.828446208, + "gpu_mem": 4.49348608, + "loss": 0.3659, + "grad_norm": 2.8229808807373047, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 1.828446208, + "gpu_mem": 4.493446144, + "loss": 0.3854, + "grad_norm": 3.468804359436035, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 1.828446208, + "gpu_mem": 4.493459968, + "loss": 0.3586, + "grad_norm": 2.4656476974487305, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 1.828642816, + "gpu_mem": 4.493464576, + "loss": 0.2411, + "grad_norm": 2.202556610107422, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 1.828642816, + "gpu_mem": 4.493426176, + "loss": 0.4281, + "grad_norm": 3.125492572784424, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 1.828642816, + "gpu_mem": 4.493449216, + "loss": 0.6751, + "grad_norm": 3.5058369636535645, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 1.828839424, + "gpu_mem": 4.49344768, + "loss": 0.39, + "grad_norm": 2.94826340675354, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 1.828839424, + "gpu_mem": 4.493466112, + "loss": 0.3809, + "grad_norm": 3.0372586250305176, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 1.829036032, + "gpu_mem": 4.49346304, + "loss": 0.3687, + "grad_norm": 3.1137216091156006, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 1.829036032, + "gpu_mem": 4.493461504, + "loss": 0.5207, + "grad_norm": 3.195096015930176, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 1.829036032, + "gpu_mem": 4.493479936, + "loss": 0.2498, + "grad_norm": 2.2157046794891357, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 1.829036032, + "gpu_mem": 4.493441536, + "loss": 0.435, + "grad_norm": 2.7655129432678223, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 1.829036032, + "gpu_mem": 4.49348608, + "loss": 0.4322, + "grad_norm": 3.4009151458740234, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 1.829036032, + "gpu_mem": 4.493450752, + "loss": 0.3392, + "grad_norm": 2.6258645057678223, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 1.82923264, + "gpu_mem": 4.4934784, + "loss": 0.5376, + "grad_norm": 3.795480251312256, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 1.82923264, + "gpu_mem": 4.493458432, + "loss": 0.4684, + "grad_norm": 3.921708583831787, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 1.82923264, + "gpu_mem": 4.493504512, + "loss": 0.5311, + "grad_norm": 3.689903974533081, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 1.82923264, + "gpu_mem": 4.493469184, + "loss": 0.3821, + "grad_norm": 2.5647411346435547, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 1.82923264, + "gpu_mem": 4.493459968, + "loss": 0.2941, + "grad_norm": 2.8284730911254883, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 1.82923264, + "gpu_mem": 4.493453824, + "loss": 0.2834, + "grad_norm": 2.466683864593506, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 1.829429248, + "gpu_mem": 4.493438464, + "loss": 0.4668, + "grad_norm": 2.9175660610198975, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 1.829429248, + "gpu_mem": 4.493456896, + "loss": 0.5894, + "grad_norm": 4.044260501861572, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 1.829429248, + "gpu_mem": 4.493458432, + "loss": 0.3793, + "grad_norm": 2.723440408706665, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 1.829429248, + "gpu_mem": 4.49346304, + "loss": 0.4921, + "grad_norm": 3.631112813949585, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 1.829429248, + "gpu_mem": 4.493466112, + "loss": 0.2675, + "grad_norm": 2.547144651412964, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 1.829429248, + "gpu_mem": 4.493459968, + "loss": 0.5852, + "grad_norm": 3.7137837409973145, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 1.829429248, + "gpu_mem": 4.49348608, + "loss": 0.5801, + "grad_norm": 3.3795206546783447, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 1.829429248, + "gpu_mem": 4.493453824, + "loss": 0.336, + "grad_norm": 1.8984929323196411, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 1.829429248, + "gpu_mem": 4.493481472, + "loss": 0.3252, + "grad_norm": 2.439871072769165, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 1.829625856, + "gpu_mem": 4.493489152, + "loss": 0.3445, + "grad_norm": 2.968015670776367, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 1.829625856, + "gpu_mem": 4.49347072, + "loss": 0.4695, + "grad_norm": 3.4054510593414307, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 1.829625856, + "gpu_mem": 4.493456896, + "loss": 0.2929, + "grad_norm": 2.5837979316711426, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 1.829625856, + "gpu_mem": 4.493467648, + "loss": 0.4489, + "grad_norm": 3.7888896465301514, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 1.829625856, + "gpu_mem": 4.493459968, + "loss": 0.4013, + "grad_norm": 3.1082444190979004, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 1.829625856, + "gpu_mem": 4.493476864, + "loss": 0.4974, + "grad_norm": 3.3254990577697754, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493449216, + "loss": 0.3719, + "grad_norm": 3.1736068725585938, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493479936, + "loss": 0.3973, + "grad_norm": 2.817267417907715, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493461504, + "loss": 0.4839, + "grad_norm": 4.014505863189697, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493449216, + "loss": 0.4599, + "grad_norm": 2.6717288494110107, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493461504, + "loss": 0.5166, + "grad_norm": 3.224177598953247, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493467648, + "loss": 0.3348, + "grad_norm": 3.703897476196289, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 1.829822464, + "gpu_mem": 4.49345536, + "loss": 0.2451, + "grad_norm": 2.198641777038574, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493444608, + "loss": 0.3224, + "grad_norm": 3.1271092891693115, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493446144, + "loss": 0.3408, + "grad_norm": 3.311673402786255, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 1.829822464, + "gpu_mem": 4.493459968, + "loss": 0.3941, + "grad_norm": 2.2674014568328857, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 1.830019072, + "gpu_mem": 4.49346304, + "loss": 0.3993, + "grad_norm": 2.844200372695923, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 1.830019072, + "gpu_mem": 4.493473792, + "loss": 0.3867, + "grad_norm": 3.105612277984619, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 1.830019072, + "gpu_mem": 4.49344768, + "loss": 0.4426, + "grad_norm": 2.887129783630371, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 1.830019072, + "gpu_mem": 4.49346304, + "loss": 0.6534, + "grad_norm": 4.086610794067383, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493472256, + "loss": 0.4841, + "grad_norm": 3.0686604976654053, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493446144, + "loss": 0.4839, + "grad_norm": 3.720183849334717, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493452288, + "loss": 0.3774, + "grad_norm": 2.5031943321228027, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493441536, + "loss": 0.5235, + "grad_norm": 3.084761142730713, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 1.83021568, + "gpu_mem": 4.49344768, + "loss": 0.466, + "grad_norm": 3.3196887969970703, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493484544, + "loss": 0.2837, + "grad_norm": 2.4187469482421875, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 1.83021568, + "gpu_mem": 4.49343232, + "loss": 0.4953, + "grad_norm": 3.2367804050445557, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493452288, + "loss": 0.3836, + "grad_norm": 2.4965908527374268, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493452288, + "loss": 0.3643, + "grad_norm": 2.6967055797576904, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 1.83021568, + "gpu_mem": 4.493450752, + "loss": 0.3367, + "grad_norm": 2.3117501735687256, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 1.830412288, + "gpu_mem": 4.493449216, + "loss": 0.2135, + "grad_norm": 1.8857765197753906, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 1.830412288, + "gpu_mem": 4.493441536, + "loss": 0.5031, + "grad_norm": 3.1443727016448975, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 1.830412288, + "gpu_mem": 4.49350144, + "loss": 0.3056, + "grad_norm": 2.212709665298462, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 1.830412288, + "gpu_mem": 4.493446144, + "loss": 0.3905, + "grad_norm": 2.7691080570220947, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 1.830412288, + "gpu_mem": 4.493429248, + "loss": 0.4637, + "grad_norm": 2.672900915145874, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 1.830412288, + "gpu_mem": 4.493459968, + "loss": 0.5628, + "grad_norm": 3.507272720336914, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493504512, + "loss": 0.4307, + "grad_norm": 2.5625147819519043, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493484544, + "loss": 0.3225, + "grad_norm": 2.545323371887207, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493484544, + "loss": 0.2655, + "grad_norm": 2.1339597702026367, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493450752, + "loss": 0.2863, + "grad_norm": 2.5236525535583496, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493475328, + "loss": 0.3839, + "grad_norm": 2.1426899433135986, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 1.830608896, + "gpu_mem": 4.4934784, + "loss": 0.4736, + "grad_norm": 3.2311179637908936, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493456896, + "loss": 0.308, + "grad_norm": 3.0182197093963623, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493479936, + "loss": 0.2922, + "grad_norm": 2.696887969970703, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493459968, + "loss": 0.4366, + "grad_norm": 4.024561405181885, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493484544, + "loss": 0.4043, + "grad_norm": 2.8331968784332275, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493467648, + "loss": 0.3547, + "grad_norm": 2.899472951889038, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 1.830608896, + "gpu_mem": 4.49346304, + "loss": 0.4281, + "grad_norm": 4.106040000915527, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493475328, + "loss": 0.343, + "grad_norm": 3.0753960609436035, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493446144, + "loss": 0.3736, + "grad_norm": 3.0308711528778076, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 1.830608896, + "gpu_mem": 4.493459968, + "loss": 0.611, + "grad_norm": 4.26881217956543, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 1.830805504, + "gpu_mem": 4.493446144, + "loss": 0.3116, + "grad_norm": 3.098851442337036, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 1.830805504, + "gpu_mem": 4.49344, + "loss": 0.484, + "grad_norm": 4.091184616088867, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493446144, + "loss": 0.2739, + "grad_norm": 2.350385904312134, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493459968, + "loss": 0.5061, + "grad_norm": 3.61661696434021, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493443072, + "loss": 0.4884, + "grad_norm": 3.1828746795654297, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493496832, + "loss": 0.3173, + "grad_norm": 2.224966049194336, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 1.832968192, + "gpu_mem": 4.49344, + "loss": 0.2739, + "grad_norm": 2.5441434383392334, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493518336, + "loss": 0.3867, + "grad_norm": 3.06235933303833, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493461504, + "loss": 0.3848, + "grad_norm": 3.358379364013672, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493479936, + "loss": 0.4504, + "grad_norm": 3.750854969024658, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 1.832968192, + "gpu_mem": 4.49345536, + "loss": 0.3453, + "grad_norm": 2.67159104347229, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 1.832968192, + "gpu_mem": 4.493487616, + "loss": 0.3819, + "grad_norm": 2.588750123977661, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493507584, + "loss": 0.4837, + "grad_norm": 3.6776764392852783, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493436928, + "loss": 0.2782, + "grad_norm": 2.4540622234344482, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493450752, + "loss": 0.2042, + "grad_norm": 1.8863685131072998, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493435392, + "loss": 0.4116, + "grad_norm": 3.134005069732666, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493473792, + "loss": 0.2893, + "grad_norm": 2.3283092975616455, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493473792, + "loss": 0.4934, + "grad_norm": 3.5883405208587646, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493459968, + "loss": 0.3806, + "grad_norm": 3.0393118858337402, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493450752, + "loss": 0.3472, + "grad_norm": 2.5635623931884766, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 1.8331648, + "gpu_mem": 4.49345536, + "loss": 0.472, + "grad_norm": 2.8446834087371826, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493458432, + "loss": 0.3598, + "grad_norm": 2.972809314727783, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 1.8331648, + "gpu_mem": 4.493464576, + "loss": 0.3936, + "grad_norm": 2.8008053302764893, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 1.833361408, + "gpu_mem": 4.493483008, + "loss": 0.3419, + "grad_norm": 3.153785228729248, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 1.833361408, + "gpu_mem": 4.493476864, + "loss": 0.5996, + "grad_norm": 4.5397257804870605, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 1.833361408, + "gpu_mem": 4.493453824, + "loss": 0.1864, + "grad_norm": 2.293668508529663, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 1.833361408, + "gpu_mem": 4.493441536, + "loss": 0.5109, + "grad_norm": 3.5782692432403564, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 1.833361408, + "gpu_mem": 4.493407744, + "loss": 0.3744, + "grad_norm": 2.996725559234619, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 1.833361408, + "gpu_mem": 4.49345536, + "loss": 0.3614, + "grad_norm": 2.9219892024993896, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 1.833361408, + "gpu_mem": 4.493421568, + "loss": 0.4649, + "grad_norm": 3.0508267879486084, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 1.833361408, + "gpu_mem": 4.493469184, + "loss": 0.4609, + "grad_norm": 2.9537737369537354, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493467648, + "loss": 0.4467, + "grad_norm": 2.669584035873413, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493469184, + "loss": 0.275, + "grad_norm": 2.125986099243164, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 1.833558016, + "gpu_mem": 4.4934784, + "loss": 0.4822, + "grad_norm": 3.5348775386810303, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493453824, + "loss": 0.5025, + "grad_norm": 3.440526247024536, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493438464, + "loss": 0.5478, + "grad_norm": 3.786937952041626, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493467648, + "loss": 0.4072, + "grad_norm": 3.0187056064605713, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493481472, + "loss": 0.4416, + "grad_norm": 3.1439080238342285, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493436928, + "loss": 0.2994, + "grad_norm": 2.288670301437378, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493443072, + "loss": 0.4906, + "grad_norm": 2.950049877166748, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493472256, + "loss": 0.4516, + "grad_norm": 3.1415066719055176, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493467648, + "loss": 0.276, + "grad_norm": 2.2431318759918213, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493453824, + "loss": 0.5492, + "grad_norm": 4.094605445861816, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 1.833558016, + "gpu_mem": 4.493467648, + "loss": 0.3491, + "grad_norm": 2.6665515899658203, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493456896, + "loss": 0.3693, + "grad_norm": 3.0234501361846924, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 1.833754624, + "gpu_mem": 4.49346304, + "loss": 0.2973, + "grad_norm": 1.9780503511428833, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493467648, + "loss": 0.5476, + "grad_norm": 3.9950039386749268, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 1.833754624, + "gpu_mem": 4.49346304, + "loss": 0.3792, + "grad_norm": 3.0586211681365967, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493436928, + "loss": 0.3422, + "grad_norm": 2.892664909362793, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493446144, + "loss": 0.3701, + "grad_norm": 2.8089659214019775, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493464576, + "loss": 0.54, + "grad_norm": 3.3375396728515625, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493435392, + "loss": 0.3485, + "grad_norm": 2.574497938156128, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493466112, + "loss": 0.3971, + "grad_norm": 3.0742547512054443, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493475328, + "loss": 0.3325, + "grad_norm": 2.48311185836792, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493436928, + "loss": 0.5967, + "grad_norm": 3.5429396629333496, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493441536, + "loss": 0.361, + "grad_norm": 2.636366128921509, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493466112, + "loss": 0.4686, + "grad_norm": 2.972994089126587, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493484544, + "loss": 0.2934, + "grad_norm": 2.2632741928100586, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493466112, + "loss": 0.4559, + "grad_norm": 3.0180819034576416, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 1.833754624, + "gpu_mem": 4.4935168, + "loss": 0.4783, + "grad_norm": 3.3111588954925537, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493449216, + "loss": 0.8541, + "grad_norm": 4.499063491821289, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493450752, + "loss": 0.3581, + "grad_norm": 2.7906272411346436, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493450752, + "loss": 0.4059, + "grad_norm": 2.7121052742004395, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493456896, + "loss": 0.2914, + "grad_norm": 2.0060596466064453, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 1.833754624, + "gpu_mem": 4.49347072, + "loss": 0.2663, + "grad_norm": 2.212587356567383, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493475328, + "loss": 0.4716, + "grad_norm": 3.3693501949310303, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493469184, + "loss": 0.3184, + "grad_norm": 2.2549638748168945, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 1.833754624, + "gpu_mem": 4.49346304, + "loss": 0.4065, + "grad_norm": 3.1387252807617188, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493476864, + "loss": 0.5942, + "grad_norm": 3.738659381866455, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493469184, + "loss": 0.4674, + "grad_norm": 3.0874791145324707, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493453824, + "loss": 0.4442, + "grad_norm": 3.321526050567627, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 1.833754624, + "gpu_mem": 4.49346304, + "loss": 0.5158, + "grad_norm": 3.3405447006225586, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493472256, + "loss": 0.3001, + "grad_norm": 2.603346586227417, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493475328, + "loss": 0.3462, + "grad_norm": 2.6403286457061768, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493429248, + "loss": 0.3297, + "grad_norm": 2.7649548053741455, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 1.833754624, + "gpu_mem": 4.493483008, + "loss": 0.27, + "grad_norm": 2.502572536468506, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493484544, + "loss": 0.4138, + "grad_norm": 2.728264331817627, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493429248, + "loss": 0.4711, + "grad_norm": 2.8098089694976807, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49346304, + "loss": 0.476, + "grad_norm": 3.227886438369751, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493441536, + "loss": 0.4664, + "grad_norm": 3.3720812797546387, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493472256, + "loss": 0.4188, + "grad_norm": 2.8544538021087646, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49344768, + "loss": 0.2551, + "grad_norm": 2.328486919403076, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493481472, + "loss": 0.339, + "grad_norm": 2.6457130908966064, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49350144, + "loss": 0.5062, + "grad_norm": 3.2027668952941895, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493466112, + "loss": 0.3456, + "grad_norm": 2.3192310333251953, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49348608, + "loss": 0.3103, + "grad_norm": 2.4868180751800537, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493466112, + "loss": 0.4046, + "grad_norm": 2.915989398956299, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493467648, + "loss": 0.3546, + "grad_norm": 2.6890010833740234, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493459968, + "loss": 0.2789, + "grad_norm": 2.7747838497161865, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493458432, + "loss": 0.4663, + "grad_norm": 2.8676254749298096, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493472256, + "loss": 0.4314, + "grad_norm": 2.9055397510528564, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493443072, + "loss": 0.3861, + "grad_norm": 2.4627325534820557, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49349376, + "loss": 0.4598, + "grad_norm": 3.121129035949707, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493490688, + "loss": 0.4269, + "grad_norm": 2.7861294746398926, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49347072, + "loss": 0.4499, + "grad_norm": 3.4911298751831055, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493452288, + "loss": 0.32, + "grad_norm": 1.9178814888000488, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493461504, + "loss": 0.2796, + "grad_norm": 2.3001492023468018, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493429248, + "loss": 0.4667, + "grad_norm": 3.5399820804595947, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493490688, + "loss": 0.4338, + "grad_norm": 2.935100793838501, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493489152, + "loss": 0.3108, + "grad_norm": 2.4993538856506348, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493444608, + "loss": 0.44, + "grad_norm": 3.0255091190338135, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493476864, + "loss": 0.4348, + "grad_norm": 2.864409923553467, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49347072, + "loss": 0.468, + "grad_norm": 3.740830659866333, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493456896, + "loss": 0.2514, + "grad_norm": 2.4231369495391846, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493456896, + "loss": 0.4601, + "grad_norm": 3.805792808532715, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493483008, + "loss": 0.2769, + "grad_norm": 2.1526989936828613, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49347072, + "loss": 0.2899, + "grad_norm": 2.3680365085601807, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493461504, + "loss": 0.4832, + "grad_norm": 2.7598695755004883, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493453824, + "loss": 0.6084, + "grad_norm": 3.987431526184082, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493476864, + "loss": 0.3074, + "grad_norm": 2.5461957454681396, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493466112, + "loss": 0.2339, + "grad_norm": 1.9635378122329712, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493450752, + "loss": 0.3494, + "grad_norm": 2.240482807159424, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 1.833951232, + "gpu_mem": 4.4935168, + "loss": 0.3414, + "grad_norm": 2.454118013381958, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493459968, + "loss": 0.4161, + "grad_norm": 2.8107340335845947, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493446144, + "loss": 0.4077, + "grad_norm": 3.0012588500976562, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493510656, + "loss": 0.4181, + "grad_norm": 3.306007146835327, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49344, + "loss": 0.4448, + "grad_norm": 2.7862045764923096, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493467648, + "loss": 0.2424, + "grad_norm": 2.16896390914917, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 1.833951232, + "gpu_mem": 4.49347072, + "loss": 0.4911, + "grad_norm": 3.126185417175293, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493446144, + "loss": 0.5142, + "grad_norm": 3.2604799270629883, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493473792, + "loss": 0.3886, + "grad_norm": 2.7932610511779785, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493483008, + "loss": 0.3329, + "grad_norm": 3.04341459274292, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493479936, + "loss": 0.3933, + "grad_norm": 2.5388660430908203, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493452288, + "loss": 0.4304, + "grad_norm": 2.8624160289764404, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493198848, + "loss": 0.2764, + "grad_norm": 3.3210692405700684, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 1.833951232, + "gpu_mem": 4.493198848, + "train_runtime": 8237.0954, + "train_samples_per_second": 4.845, + "train_steps_per_second": 0.076, + "total_flos": 8.474956527272755e+16, + "train_loss": 0.7438592348629848 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26ebe9ef584396639cb6b281f2c8108d7f3fd14a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f7dc89fae284272ae1f22a34b5d46850129b430a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.29539351373683126 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..bf1896c09897517bec8362bd7bd4e2149e59cee7 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-logiqa-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2", + "seed": 42, + "timestamp": "2025-08-29T15:07:35.784150" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..8f60d60872e666c0cb892018435e25c4f96dc0e8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r2-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.739329536, + "gpu_mem": 4.424109568, + "loss": 3.8396, + "grad_norm": 14.890569686889648, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.742671872, + "gpu_mem": 4.436744704, + "loss": 3.9728, + "grad_norm": 14.838702201843262, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.742671872, + "gpu_mem": 4.436821504, + "loss": 3.8356, + "grad_norm": 14.563454627990723, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436720128, + "loss": 3.7722, + "grad_norm": 15.690662384033203, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436735488, + "loss": 3.832, + "grad_norm": 14.722855567932129, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436727808, + "loss": 3.7182, + "grad_norm": 13.99138355255127, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436798464, + "loss": 3.6497, + "grad_norm": 14.228583335876465, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436783104, + "loss": 3.4215, + "grad_norm": 13.729578018188477, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436781568, + "loss": 3.2754, + "grad_norm": 13.172603607177734, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43679232, + "loss": 3.4197, + "grad_norm": 12.823274612426758, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436695552, + "loss": 2.9908, + "grad_norm": 11.39487361907959, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43674624, + "loss": 2.6939, + "grad_norm": 10.157551765441895, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.743065088, + "gpu_mem": 4.4368384, + "loss": 2.5334, + "grad_norm": 8.99533748626709, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436750848, + "loss": 2.3871, + "grad_norm": 7.733980655670166, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436889088, + "loss": 2.082, + "grad_norm": 6.679530620574951, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436749312, + "loss": 1.9155, + "grad_norm": 4.9330291748046875, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436780032, + "loss": 1.6732, + "grad_norm": 3.505525827407837, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436743168, + "loss": 1.6612, + "grad_norm": 3.4193079471588135, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436651008, + "loss": 1.5623, + "grad_norm": 2.2514662742614746, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436689408, + "loss": 1.5152, + "grad_norm": 1.4252986907958984, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43682304, + "loss": 1.5075, + "grad_norm": 2.3872432708740234, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436721664, + "loss": 1.4449, + "grad_norm": 1.6003974676132202, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436749312, + "loss": 1.4348, + "grad_norm": 1.748751163482666, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436743168, + "loss": 1.4464, + "grad_norm": 1.5526318550109863, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436749312, + "loss": 1.4717, + "grad_norm": 2.194411516189575, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436801536, + "loss": 1.4254, + "grad_norm": 2.8708908557891846, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436743168, + "loss": 1.4397, + "grad_norm": 1.7595107555389404, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436689408, + "loss": 1.4387, + "grad_norm": 1.2992548942565918, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436781568, + "loss": 1.4462, + "grad_norm": 1.5589715242385864, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43677696, + "loss": 1.4385, + "grad_norm": 2.785175323486328, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436755456, + "loss": 1.4369, + "grad_norm": 2.2685933113098145, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436760064, + "loss": 1.402, + "grad_norm": 1.674202799797058, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436795392, + "loss": 1.3477, + "grad_norm": 1.4087262153625488, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436720128, + "loss": 1.4901, + "grad_norm": 1.9975885152816772, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436767744, + "loss": 1.4603, + "grad_norm": 2.562002420425415, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436783104, + "loss": 1.5432, + "grad_norm": 3.635402202606201, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436849152, + "loss": 1.3955, + "grad_norm": 2.4205291271209717, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436732416, + "loss": 1.3782, + "grad_norm": 1.0632727146148682, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436849152, + "loss": 1.4481, + "grad_norm": 1.5670065879821777, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436770816, + "loss": 1.3873, + "grad_norm": 0.9268918037414551, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43666944, + "loss": 1.4305, + "grad_norm": 1.4222270250320435, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436741632, + "loss": 1.4171, + "grad_norm": 0.8759964108467102, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436704768, + "loss": 1.3998, + "grad_norm": 1.1457537412643433, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436740096, + "loss": 1.43, + "grad_norm": 1.0342144966125488, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436793856, + "loss": 1.4123, + "grad_norm": 0.8197781443595886, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436841472, + "loss": 1.4025, + "grad_norm": 1.188051462173462, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43666944, + "loss": 1.401, + "grad_norm": 1.0979923009872437, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.743065088, + "gpu_mem": 4.4367232, + "loss": 1.4091, + "grad_norm": 0.91158127784729, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436712448, + "loss": 1.3812, + "grad_norm": 0.5696355700492859, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436721664, + "loss": 1.3844, + "grad_norm": 0.7077459692955017, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436816896, + "loss": 1.3561, + "grad_norm": 0.5969799160957336, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436758528, + "loss": 1.4594, + "grad_norm": 2.1068673133850098, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436836864, + "loss": 1.4788, + "grad_norm": 2.285597562789917, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436737024, + "loss": 1.4165, + "grad_norm": 1.251336693763733, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436733952, + "loss": 1.3761, + "grad_norm": 0.778680682182312, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436778496, + "loss": 1.428, + "grad_norm": 1.3937779664993286, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436770816, + "loss": 1.3857, + "grad_norm": 0.7286733984947205, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436793856, + "loss": 1.3887, + "grad_norm": 0.60840904712677, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.743065088, + "gpu_mem": 4.4367616, + "loss": 1.3681, + "grad_norm": 1.0955506563186646, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436752384, + "loss": 1.4251, + "grad_norm": 0.805712878704071, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436789248, + "loss": 1.362, + "grad_norm": 0.39562878012657166, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436829184, + "loss": 1.3987, + "grad_norm": 0.627046525478363, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436758528, + "loss": 1.3843, + "grad_norm": 0.4877071678638458, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436660224, + "loss": 1.3898, + "grad_norm": 0.9378731846809387, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436741632, + "loss": 1.3968, + "grad_norm": 0.5176810622215271, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43684608, + "loss": 1.4015, + "grad_norm": 0.2605176270008087, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.743065088, + "gpu_mem": 4.4367232, + "loss": 1.4292, + "grad_norm": 0.5805103182792664, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436775424, + "loss": 1.4081, + "grad_norm": 0.6137901544570923, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436767744, + "loss": 1.3809, + "grad_norm": 0.439973384141922, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436683264, + "loss": 1.3855, + "grad_norm": 0.30250582098960876, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436706304, + "loss": 1.4116, + "grad_norm": 1.355234980583191, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436755456, + "loss": 1.3985, + "grad_norm": 0.5770633816719055, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436709376, + "loss": 1.4031, + "grad_norm": 0.4191872775554657, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436763136, + "loss": 1.396, + "grad_norm": 0.7420408725738525, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436806144, + "loss": 1.3765, + "grad_norm": 0.6575621366500854, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43675392, + "loss": 1.3739, + "grad_norm": 0.41931378841400146, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43669248, + "loss": 1.3974, + "grad_norm": 0.4701511561870575, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436793856, + "loss": 1.3603, + "grad_norm": 0.6620966196060181, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436810752, + "loss": 1.3582, + "grad_norm": 1.4872643947601318, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436651008, + "loss": 1.4545, + "grad_norm": 1.3103445768356323, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436756992, + "loss": 1.443, + "grad_norm": 1.3303346633911133, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436729344, + "loss": 1.3957, + "grad_norm": 1.0776361227035522, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436743168, + "loss": 1.3889, + "grad_norm": 1.3011589050292969, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436749312, + "loss": 1.3971, + "grad_norm": 0.7042205929756165, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436778496, + "loss": 1.3895, + "grad_norm": 0.45027655363082886, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436720128, + "loss": 1.3695, + "grad_norm": 0.4252176880836487, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436741632, + "loss": 1.3872, + "grad_norm": 0.23573040962219238, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436767744, + "loss": 1.3742, + "grad_norm": 0.6922330856323242, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.743065088, + "gpu_mem": 4.43671552, + "loss": 1.3951, + "grad_norm": 0.7909450531005859, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436747776, + "loss": 1.4245, + "grad_norm": 0.7887980937957764, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436740096, + "loss": 1.3583, + "grad_norm": 0.6335704922676086, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436680192, + "loss": 1.3885, + "grad_norm": 0.638899564743042, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.743065088, + "gpu_mem": 4.436881408, + "loss": 1.411, + "grad_norm": 0.7641863822937012, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436695552, + "loss": 1.465, + "grad_norm": 1.950778841972351, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436862976, + "loss": 1.4305, + "grad_norm": 1.4036184549331665, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436737024, + "loss": 1.3637, + "grad_norm": 0.8325475454330444, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436732416, + "loss": 1.3833, + "grad_norm": 0.5069141387939453, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436780032, + "loss": 1.3817, + "grad_norm": 1.0236603021621704, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.743261696, + "gpu_mem": 4.4367616, + "loss": 1.4183, + "grad_norm": 1.984674096107483, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436726272, + "loss": 1.3696, + "grad_norm": 0.7911021113395691, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436709376, + "loss": 1.4085, + "grad_norm": 0.6993526816368103, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436760064, + "loss": 1.4031, + "grad_norm": 0.9811055064201355, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436720128, + "loss": 1.377, + "grad_norm": 0.5724412202835083, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436809216, + "loss": 1.3963, + "grad_norm": 0.409641295671463, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436678656, + "loss": 1.3848, + "grad_norm": 0.6518911123275757, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436741632, + "loss": 1.4199, + "grad_norm": 0.7107927799224854, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436737024, + "loss": 1.3726, + "grad_norm": 0.297811895608902, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436726272, + "loss": 1.3762, + "grad_norm": 0.4476221799850464, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436895232, + "loss": 1.3918, + "grad_norm": 0.3346934914588928, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43669248, + "loss": 1.3425, + "grad_norm": 0.5044710040092468, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436694016, + "loss": 1.3738, + "grad_norm": 0.38246071338653564, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436793856, + "loss": 1.39, + "grad_norm": 0.4819703698158264, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43689216, + "loss": 1.3948, + "grad_norm": 0.6270788311958313, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436721664, + "loss": 1.406, + "grad_norm": 0.6029590368270874, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43673088, + "loss": 1.4022, + "grad_norm": 0.40159720182418823, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43679232, + "loss": 1.3807, + "grad_norm": 0.5236168503761292, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.743261696, + "gpu_mem": 4.4366848, + "loss": 1.3749, + "grad_norm": 0.3737568259239197, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436780032, + "loss": 1.3839, + "grad_norm": 0.33281195163726807, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436913664, + "loss": 1.3768, + "grad_norm": 0.5290294885635376, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436816896, + "loss": 1.4226, + "grad_norm": 1.1272141933441162, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436862976, + "loss": 1.3915, + "grad_norm": 0.5086135268211365, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43680768, + "loss": 1.3836, + "grad_norm": 0.4160374402999878, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43684608, + "loss": 1.3915, + "grad_norm": 0.7828189134597778, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43676928, + "loss": 1.4056, + "grad_norm": 0.6638072729110718, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436804608, + "loss": 1.3987, + "grad_norm": 0.5323939323425293, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436710912, + "loss": 1.3697, + "grad_norm": 0.44775280356407166, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436735488, + "loss": 1.4134, + "grad_norm": 0.5691746473312378, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43670784, + "loss": 1.3776, + "grad_norm": 0.4177893400192261, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436713984, + "loss": 1.4166, + "grad_norm": 0.6348063945770264, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436704768, + "loss": 1.4313, + "grad_norm": 0.6840851306915283, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.743261696, + "gpu_mem": 4.43663872, + "loss": 1.3743, + "grad_norm": 0.23212480545043945, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436801536, + "loss": 1.4049, + "grad_norm": 0.5477487444877625, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436713984, + "loss": 1.3754, + "grad_norm": 0.7280015349388123, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.743261696, + "gpu_mem": 4.436717056, + "loss": 1.3888, + "grad_norm": 0.29141995310783386, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.744244736, + "gpu_mem": 4.436772352, + "loss": 1.3771, + "grad_norm": 0.49185696244239807, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.746014208, + "gpu_mem": 4.436675584, + "loss": 1.4121, + "grad_norm": 0.634144127368927, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.746014208, + "gpu_mem": 4.436843008, + "loss": 1.3839, + "grad_norm": 0.49704277515411377, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.746014208, + "gpu_mem": 4.436709376, + "loss": 1.3913, + "grad_norm": 0.8326661586761475, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.746014208, + "gpu_mem": 4.436698624, + "loss": 1.3476, + "grad_norm": 0.805647075176239, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.746014208, + "gpu_mem": 4.436750848, + "loss": 1.3564, + "grad_norm": 0.606671154499054, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.746210816, + "gpu_mem": 4.436675584, + "loss": 1.4004, + "grad_norm": 0.8290622234344482, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.746210816, + "gpu_mem": 4.436737024, + "loss": 1.4311, + "grad_norm": 1.1394269466400146, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.746604032, + "gpu_mem": 4.436735488, + "loss": 1.4059, + "grad_norm": 0.7818772196769714, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.746997248, + "gpu_mem": 4.436680192, + "loss": 1.3593, + "grad_norm": 0.25497373938560486, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.747390464, + "gpu_mem": 4.43667712, + "loss": 1.4054, + "grad_norm": 0.8249735236167908, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.747980288, + "gpu_mem": 4.436778496, + "loss": 1.3818, + "grad_norm": 0.6901052594184875, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.748176896, + "gpu_mem": 4.436783104, + "loss": 1.3719, + "grad_norm": 0.6487355828285217, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.748570112, + "gpu_mem": 4.436881408, + "loss": 1.3815, + "grad_norm": 0.5103967189788818, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.749159936, + "gpu_mem": 4.436724736, + "loss": 1.3936, + "grad_norm": 0.6964251399040222, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.749553152, + "gpu_mem": 4.436717056, + "loss": 1.3817, + "grad_norm": 0.42476513981819153, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.750339584, + "gpu_mem": 4.436660224, + "loss": 1.3719, + "grad_norm": 0.25353553891181946, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.750536192, + "gpu_mem": 4.43669248, + "loss": 1.37, + "grad_norm": 0.3466099202632904, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.750929408, + "gpu_mem": 4.436720128, + "loss": 1.3672, + "grad_norm": 0.4257703125476837, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.751519232, + "gpu_mem": 4.436770816, + "loss": 1.3724, + "grad_norm": 0.5158722400665283, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436683264, + "loss": 1.4001, + "grad_norm": 0.328879714012146, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436703232, + "loss": 1.4019, + "grad_norm": 0.7888994812965393, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436780032, + "loss": 1.3774, + "grad_norm": 0.6170064210891724, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436681728, + "loss": 1.4038, + "grad_norm": 0.45818525552749634, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436783104, + "loss": 1.4131, + "grad_norm": 0.6385368704795837, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436740096, + "loss": 1.4096, + "grad_norm": 0.921544075012207, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43673856, + "loss": 1.3676, + "grad_norm": 0.3892118036746979, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436721664, + "loss": 1.3821, + "grad_norm": 0.24806740880012512, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4368384, + "loss": 1.4089, + "grad_norm": 0.6826182007789612, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436733952, + "loss": 1.3929, + "grad_norm": 0.7167485356330872, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436813824, + "loss": 1.3958, + "grad_norm": 0.6157979369163513, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4367232, + "loss": 1.3508, + "grad_norm": 0.5625067353248596, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4366848, + "loss": 1.3907, + "grad_norm": 0.6795075535774231, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436687872, + "loss": 1.3826, + "grad_norm": 0.7599951028823853, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436755456, + "loss": 1.3655, + "grad_norm": 0.4838642179965973, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436763136, + "loss": 1.398, + "grad_norm": 0.4351670444011688, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436737024, + "loss": 1.3814, + "grad_norm": 0.4242396950721741, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436717056, + "loss": 1.3851, + "grad_norm": 0.365092933177948, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43675392, + "loss": 1.3871, + "grad_norm": 0.38515305519104004, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436697088, + "loss": 1.4092, + "grad_norm": 0.38060635328292847, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436767744, + "loss": 1.3656, + "grad_norm": 0.389305055141449, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436664832, + "loss": 1.3678, + "grad_norm": 0.2662959396839142, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436674048, + "loss": 1.3808, + "grad_norm": 0.4523981511592865, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436750848, + "loss": 1.3679, + "grad_norm": 0.3419038951396942, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436758528, + "loss": 1.4013, + "grad_norm": 0.5359638929367065, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436764672, + "loss": 1.3825, + "grad_norm": 0.33749067783355713, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436683264, + "loss": 1.4003, + "grad_norm": 0.35537198185920715, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436793856, + "loss": 1.392, + "grad_norm": 0.3484475016593933, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436781568, + "loss": 1.3878, + "grad_norm": 0.32158714532852173, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436764672, + "loss": 1.3833, + "grad_norm": 0.7160983681678772, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436703232, + "loss": 1.3825, + "grad_norm": 0.704812228679657, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436735488, + "loss": 1.347, + "grad_norm": 0.28050166368484497, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436832256, + "loss": 1.3708, + "grad_norm": 0.24672040343284607, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436804608, + "loss": 1.3512, + "grad_norm": 0.2819032669067383, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436709376, + "loss": 1.4678, + "grad_norm": 1.0478708744049072, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436741632, + "loss": 1.3791, + "grad_norm": 0.5302679538726807, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43673088, + "loss": 1.4214, + "grad_norm": 0.6149977445602417, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436667904, + "loss": 1.3833, + "grad_norm": 0.5764893889427185, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436789248, + "loss": 1.4085, + "grad_norm": 0.4914844036102295, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436710912, + "loss": 1.3619, + "grad_norm": 0.3805903196334839, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43682304, + "loss": 1.3782, + "grad_norm": 0.3630470931529999, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436818432, + "loss": 1.3791, + "grad_norm": 0.375320702791214, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 2.063, + "grad_norm": 0.5011320114135742, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443126272, + "loss": 1.3666, + "grad_norm": 0.30286723375320435, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4431232, + "loss": 1.4004, + "grad_norm": 0.7149483561515808, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443008, + "loss": 1.3577, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443058688, + "loss": 1.3907, + "grad_norm": 0.5887948274612427, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443052544, + "loss": 1.3856, + "grad_norm": 0.533507227897644, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44310016, + "loss": 1.3861, + "grad_norm": 0.49695512652397156, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443032576, + "loss": 1.3516, + "grad_norm": 0.42552492022514343, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443078656, + "loss": 1.3797, + "grad_norm": 0.3473730981349945, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44309248, + "loss": 1.357, + "grad_norm": 0.5166810154914856, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443047936, + "loss": 1.3865, + "grad_norm": 0.695208728313446, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443040256, + "loss": 1.3715, + "grad_norm": 0.5172139406204224, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4431616, + "loss": 1.3774, + "grad_norm": 0.7842063903808594, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443058688, + "loss": 1.3592, + "grad_norm": 0.592850923538208, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 1.3666, + "grad_norm": 0.5342758297920227, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443063296, + "loss": 1.379, + "grad_norm": 0.4315674602985382, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442989568, + "loss": 1.3979, + "grad_norm": 0.6502771377563477, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44298496, + "loss": 1.3552, + "grad_norm": 0.725476861000061, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443035648, + "loss": 1.3843, + "grad_norm": 1.1101680994033813, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443164672, + "loss": 1.384, + "grad_norm": 0.6923633813858032, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443026432, + "loss": 1.4379, + "grad_norm": 1.0502599477767944, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443052544, + "loss": 1.4131, + "grad_norm": 0.8403629660606384, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443064832, + "loss": 1.4059, + "grad_norm": 0.5872739553451538, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443037184, + "loss": 1.435, + "grad_norm": 1.2773349285125732, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443011072, + "loss": 1.4214, + "grad_norm": 0.568953812122345, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44302336, + "loss": 1.3727, + "grad_norm": 0.45195794105529785, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443051008, + "loss": 1.3701, + "grad_norm": 0.5958787798881531, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44306944, + "loss": 1.3909, + "grad_norm": 0.44872698187828064, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443120128, + "loss": 1.3444, + "grad_norm": 0.3074606955051422, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443070976, + "loss": 1.4061, + "grad_norm": 0.40770965814590454, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443021824, + "loss": 1.3891, + "grad_norm": 0.531233549118042, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443066368, + "loss": 1.4126, + "grad_norm": 0.6560361385345459, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442964992, + "loss": 1.3634, + "grad_norm": 0.39216163754463196, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443032576, + "loss": 1.3936, + "grad_norm": 0.4498794972896576, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44302336, + "loss": 1.4174, + "grad_norm": 0.764809787273407, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443027968, + "loss": 1.3522, + "grad_norm": 0.67972332239151, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443143168, + "loss": 1.3695, + "grad_norm": 0.5254322290420532, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443103232, + "loss": 1.4108, + "grad_norm": 0.5276061296463013, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44306176, + "loss": 1.3867, + "grad_norm": 0.6067991256713867, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 1.3857, + "grad_norm": 0.7150607109069824, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4430848, + "loss": 1.3845, + "grad_norm": 0.4039321839809418, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443104768, + "loss": 1.3878, + "grad_norm": 0.4020727872848511, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443021824, + "loss": 1.3698, + "grad_norm": 0.5960152745246887, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443156992, + "loss": 1.3754, + "grad_norm": 0.39943110942840576, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443110912, + "loss": 1.3752, + "grad_norm": 0.5267123579978943, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443132416, + "loss": 1.3894, + "grad_norm": 0.40814080834388733, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443018752, + "loss": 1.3549, + "grad_norm": 1.0090746879577637, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443006464, + "loss": 1.4225, + "grad_norm": 0.9296849966049194, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443011072, + "loss": 1.3972, + "grad_norm": 0.7424530982971191, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443047936, + "loss": 1.3891, + "grad_norm": 0.5140881538391113, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443009536, + "loss": 1.3645, + "grad_norm": 0.6565972566604614, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443055616, + "loss": 1.4001, + "grad_norm": 0.44416433572769165, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443064832, + "loss": 1.3794, + "grad_norm": 0.2666358947753906, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443103232, + "loss": 1.3734, + "grad_norm": 0.697902500629425, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443021824, + "loss": 1.379, + "grad_norm": 0.6014329195022583, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44303104, + "loss": 1.3746, + "grad_norm": 0.8216103911399841, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443097088, + "loss": 1.3617, + "grad_norm": 0.3290558159351349, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44302336, + "loss": 1.341, + "grad_norm": 0.5096772909164429, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443017216, + "loss": 1.3894, + "grad_norm": 0.708290696144104, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443083264, + "loss": 1.3923, + "grad_norm": 0.7479275465011597, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443041792, + "loss": 1.3559, + "grad_norm": 0.3675205707550049, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443155456, + "loss": 1.3727, + "grad_norm": 0.6131960153579712, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443032576, + "loss": 1.3866, + "grad_norm": 0.6342862844467163, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443058688, + "loss": 1.389, + "grad_norm": 0.5796828866004944, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443064832, + "loss": 1.3892, + "grad_norm": 0.8109932541847229, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443132416, + "loss": 1.3729, + "grad_norm": 0.4213597774505615, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442988032, + "loss": 1.394, + "grad_norm": 0.8927932977676392, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443121664, + "loss": 1.373, + "grad_norm": 0.36559730768203735, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443150848, + "loss": 1.3689, + "grad_norm": 0.9183509945869446, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443020288, + "loss": 1.4004, + "grad_norm": 0.691754937171936, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443021824, + "loss": 1.4019, + "grad_norm": 0.6860160231590271, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44302336, + "loss": 1.3552, + "grad_norm": 0.2803248167037964, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443097088, + "loss": 1.4101, + "grad_norm": 1.3276103734970093, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443078656, + "loss": 1.3491, + "grad_norm": 0.3497350215911865, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443021824, + "loss": 1.4174, + "grad_norm": 1.0692956447601318, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 1.3573, + "grad_norm": 0.5750541687011719, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443127808, + "loss": 1.3822, + "grad_norm": 0.5188866257667542, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443080192, + "loss": 1.4124, + "grad_norm": 0.8508633375167847, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44320768, + "loss": 1.3885, + "grad_norm": 0.5124416947364807, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44309248, + "loss": 1.3719, + "grad_norm": 0.4789351224899292, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443041792, + "loss": 1.3588, + "grad_norm": 0.5270251035690308, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443029504, + "loss": 1.3311, + "grad_norm": 0.5683107972145081, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443021824, + "loss": 1.3799, + "grad_norm": 0.29272767901420593, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443144704, + "loss": 1.3814, + "grad_norm": 0.6151399612426758, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4430848, + "loss": 1.4301, + "grad_norm": 1.1445720195770264, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44303104, + "loss": 1.4225, + "grad_norm": 0.8756213784217834, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443047936, + "loss": 1.4342, + "grad_norm": 0.8914481997489929, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443187712, + "loss": 1.3629, + "grad_norm": 0.43964895606040955, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443058688, + "loss": 1.3787, + "grad_norm": 0.4049091637134552, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44305408, + "loss": 1.4022, + "grad_norm": 0.6066441535949707, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442957312, + "loss": 1.3752, + "grad_norm": 0.473114937543869, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443032576, + "loss": 1.3664, + "grad_norm": 0.4487999975681305, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443052544, + "loss": 1.3819, + "grad_norm": 0.37400707602500916, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443018752, + "loss": 1.3682, + "grad_norm": 0.29694893956184387, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442980352, + "loss": 1.3704, + "grad_norm": 0.3713376820087433, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443106304, + "loss": 1.3758, + "grad_norm": 0.3098212480545044, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442983424, + "loss": 1.406, + "grad_norm": 0.47422292828559875, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443026432, + "loss": 1.4, + "grad_norm": 0.40575140714645386, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443009536, + "loss": 1.347, + "grad_norm": 0.5910519361495972, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443051008, + "loss": 1.3907, + "grad_norm": 0.6195645332336426, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442991104, + "loss": 1.3643, + "grad_norm": 0.4798504412174225, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443014144, + "loss": 1.3624, + "grad_norm": 0.7231450080871582, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443003392, + "loss": 1.3606, + "grad_norm": 0.40347886085510254, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44301568, + "loss": 1.3785, + "grad_norm": 0.3462912440299988, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443018752, + "loss": 1.3401, + "grad_norm": 0.7711390852928162, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44300032, + "loss": 1.3761, + "grad_norm": 0.34701788425445557, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443006464, + "loss": 1.3702, + "grad_norm": 0.6614573001861572, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44295424, + "loss": 1.447, + "grad_norm": 1.3555269241333008, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443026432, + "loss": 1.3496, + "grad_norm": 0.41422897577285767, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443067904, + "loss": 1.3535, + "grad_norm": 0.46228137612342834, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44301568, + "loss": 1.3525, + "grad_norm": 0.36741986870765686, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443026432, + "loss": 1.3997, + "grad_norm": 0.5673362016677856, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443009536, + "loss": 1.3959, + "grad_norm": 0.6793502569198608, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443064832, + "loss": 1.3707, + "grad_norm": 0.3574424386024475, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44303104, + "loss": 1.3617, + "grad_norm": 0.28747645020484924, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443040256, + "loss": 1.3907, + "grad_norm": 0.3864355981349945, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443041792, + "loss": 1.3727, + "grad_norm": 0.7653246521949768, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44326912, + "loss": 1.3696, + "grad_norm": 0.4049779772758484, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44306176, + "loss": 1.367, + "grad_norm": 0.5472803711891174, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443034112, + "loss": 1.3705, + "grad_norm": 0.5814691185951233, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442998784, + "loss": 1.3645, + "grad_norm": 0.8484359383583069, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443089408, + "loss": 1.3489, + "grad_norm": 0.6026954650878906, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443014144, + "loss": 1.3531, + "grad_norm": 0.749883234500885, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44306176, + "loss": 1.3668, + "grad_norm": 0.5169723033905029, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443003392, + "loss": 1.3408, + "grad_norm": 0.6389563083648682, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443224576, + "loss": 1.4, + "grad_norm": 0.9002379775047302, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443106304, + "loss": 1.3494, + "grad_norm": 0.7456210851669312, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442998784, + "loss": 1.3513, + "grad_norm": 1.214333176612854, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44303872, + "loss": 1.3977, + "grad_norm": 0.7481204271316528, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443086336, + "loss": 1.3488, + "grad_norm": 0.7917167544364929, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443052544, + "loss": 1.382, + "grad_norm": 0.8137959837913513, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44310784, + "loss": 1.3795, + "grad_norm": 0.788689911365509, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44300032, + "loss": 1.3937, + "grad_norm": 1.2032560110092163, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4430848, + "loss": 1.7253, + "grad_norm": 93.90869903564453, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443074048, + "loss": 1.3583, + "grad_norm": 2.907238483428955, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443029504, + "loss": 1.4153, + "grad_norm": 1.644963026046753, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443118592, + "loss": 1.4227, + "grad_norm": 1.5427173376083374, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4430464, + "loss": 1.3727, + "grad_norm": 4.219435691833496, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443055616, + "loss": 1.4003, + "grad_norm": 0.82020103931427, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44310784, + "loss": 1.3571, + "grad_norm": 0.6735349297523499, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44310784, + "loss": 1.3573, + "grad_norm": 1.3740174770355225, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443144704, + "loss": 1.4087, + "grad_norm": 1.0574979782104492, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44306176, + "loss": 1.404, + "grad_norm": 0.8239401578903198, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44313088, + "loss": 1.3668, + "grad_norm": 0.5590053200721741, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443072512, + "loss": 1.37, + "grad_norm": 0.5792288780212402, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443078656, + "loss": 1.3381, + "grad_norm": 0.6344015598297119, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442989568, + "loss": 1.3917, + "grad_norm": 0.8274447917938232, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44310784, + "loss": 1.3931, + "grad_norm": 0.5958060622215271, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 1.3524, + "grad_norm": 0.5338724851608276, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443006464, + "loss": 1.3722, + "grad_norm": 0.687784731388092, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443055616, + "loss": 1.3597, + "grad_norm": 0.6201812624931335, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443181568, + "loss": 1.3874, + "grad_norm": 0.5360949635505676, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442980352, + "loss": 1.3361, + "grad_norm": 0.4340069591999054, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443044864, + "loss": 1.3704, + "grad_norm": 0.3970584571361542, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 1.3533, + "grad_norm": 0.3974694013595581, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443051008, + "loss": 1.3623, + "grad_norm": 0.3154001832008362, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443047936, + "loss": 1.3601, + "grad_norm": 0.7864052653312683, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44302336, + "loss": 1.3915, + "grad_norm": 0.521012008190155, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443006464, + "loss": 1.3566, + "grad_norm": 0.35906168818473816, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443008, + "loss": 1.365, + "grad_norm": 0.3324143886566162, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 1.3543, + "grad_norm": 0.6431567072868347, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443126272, + "loss": 1.3794, + "grad_norm": 0.9437767267227173, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443080192, + "loss": 1.3527, + "grad_norm": 0.48852601647377014, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44313088, + "loss": 1.3804, + "grad_norm": 0.387025386095047, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443101696, + "loss": 1.406, + "grad_norm": 0.6763213276863098, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443035648, + "loss": 1.3468, + "grad_norm": 0.5405109524726868, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44299264, + "loss": 1.3316, + "grad_norm": 0.4610773026943207, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443020288, + "loss": 1.32, + "grad_norm": 0.8890807628631592, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443003392, + "loss": 1.3799, + "grad_norm": 0.4655788540840149, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4430464, + "loss": 1.3638, + "grad_norm": 0.6658959984779358, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44310016, + "loss": 1.3888, + "grad_norm": 0.956990122795105, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442997248, + "loss": 1.3822, + "grad_norm": 0.6323347687721252, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443057152, + "loss": 1.3604, + "grad_norm": 1.0068360567092896, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442980352, + "loss": 1.3702, + "grad_norm": 1.0610517263412476, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443140096, + "loss": 1.3503, + "grad_norm": 0.6671691536903381, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442994176, + "loss": 1.3589, + "grad_norm": 0.4450676441192627, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443017216, + "loss": 1.3317, + "grad_norm": 0.5091978907585144, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443040256, + "loss": 1.4249, + "grad_norm": 1.3311100006103516, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443264512, + "loss": 1.4448, + "grad_norm": 1.1353776454925537, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443021824, + "loss": 1.3726, + "grad_norm": 0.5384571552276611, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443144704, + "loss": 1.3645, + "grad_norm": 0.6091146469116211, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443060224, + "loss": 1.3778, + "grad_norm": 0.5416065454483032, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443149312, + "loss": 1.3787, + "grad_norm": 0.996558666229248, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443047936, + "loss": 1.3721, + "grad_norm": 0.8434969782829285, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443055616, + "loss": 1.3708, + "grad_norm": 0.5035191178321838, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442978816, + "loss": 1.3345, + "grad_norm": 0.779111921787262, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443058688, + "loss": 1.4146, + "grad_norm": 0.9946375489234924, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443029504, + "loss": 1.3745, + "grad_norm": 0.6795583367347717, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443017216, + "loss": 1.3262, + "grad_norm": 0.5522642135620117, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443160064, + "loss": 1.3092, + "grad_norm": 0.682063639163971, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443043328, + "loss": 1.369, + "grad_norm": 0.496204137802124, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44305408, + "loss": 1.3772, + "grad_norm": 0.7089257836341858, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443209216, + "loss": 1.3984, + "grad_norm": 0.7045975923538208, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443018752, + "loss": 1.3823, + "grad_norm": 0.7336382865905762, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.754468352, + "gpu_mem": 4.443040256, + "loss": 1.3697, + "grad_norm": 0.6636502742767334, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.754468352, + "gpu_mem": 4.44307712, + "loss": 1.3894, + "grad_norm": 0.6760593056678772, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.754468352, + "gpu_mem": 4.442663936, + "loss": 2.058, + "grad_norm": 0.8926852941513062, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436690944, + "loss": 1.3505, + "grad_norm": 0.8491313457489014, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43666944, + "loss": 1.3789, + "grad_norm": 0.5739907622337341, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4366848, + "loss": 1.3709, + "grad_norm": 0.9756261110305786, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436832256, + "loss": 1.3362, + "grad_norm": 0.538524329662323, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436778496, + "loss": 1.3778, + "grad_norm": 0.6404532194137573, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436795392, + "loss": 1.3736, + "grad_norm": 0.6494705080986023, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436747776, + "loss": 1.3841, + "grad_norm": 0.620377242565155, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436772352, + "loss": 1.3767, + "grad_norm": 0.6045562028884888, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436819968, + "loss": 1.3427, + "grad_norm": 0.612251341342926, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436729344, + "loss": 1.3738, + "grad_norm": 0.8450515270233154, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436701696, + "loss": 1.3502, + "grad_norm": 0.8077211976051331, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436790784, + "loss": 1.3287, + "grad_norm": 0.4236655831336975, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436681728, + "loss": 1.3596, + "grad_norm": 0.6391414403915405, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436733952, + "loss": 1.34, + "grad_norm": 0.5631117820739746, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436687872, + "loss": 1.3461, + "grad_norm": 0.580104649066925, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436890624, + "loss": 1.3457, + "grad_norm": 0.53022700548172, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436889088, + "loss": 1.3721, + "grad_norm": 0.8033708930015564, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436812288, + "loss": 1.3268, + "grad_norm": 0.5477765202522278, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43676928, + "loss": 1.3299, + "grad_norm": 0.5233580470085144, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43674624, + "loss": 1.3197, + "grad_norm": 0.6099407076835632, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43670016, + "loss": 1.3327, + "grad_norm": 0.5495460629463196, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436816896, + "loss": 1.3185, + "grad_norm": 0.8084400296211243, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436704768, + "loss": 1.3546, + "grad_norm": 0.5238580107688904, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436694016, + "loss": 1.3587, + "grad_norm": 0.6973832249641418, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436698624, + "loss": 1.3357, + "grad_norm": 0.4791755974292755, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436813824, + "loss": 1.4031, + "grad_norm": 0.8645904660224915, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436787712, + "loss": 1.3699, + "grad_norm": 0.7681152820587158, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436689408, + "loss": 1.3169, + "grad_norm": 1.0802290439605713, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436724736, + "loss": 1.3371, + "grad_norm": 0.8197013139724731, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436690944, + "loss": 1.3337, + "grad_norm": 0.6200465559959412, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436824576, + "loss": 1.3075, + "grad_norm": 0.6519500017166138, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436798464, + "loss": 1.3407, + "grad_norm": 0.8384870886802673, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436843008, + "loss": 1.295, + "grad_norm": 0.9940242171287537, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436733952, + "loss": 1.3128, + "grad_norm": 1.050050139427185, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436750848, + "loss": 1.3192, + "grad_norm": 1.335123062133789, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436826112, + "loss": 1.3827, + "grad_norm": 1.2839961051940918, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436674048, + "loss": 1.3968, + "grad_norm": 1.2869881391525269, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43679232, + "loss": 1.3694, + "grad_norm": 2.3456263542175293, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436781568, + "loss": 1.3367, + "grad_norm": 2.165839195251465, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436747776, + "loss": 1.3431, + "grad_norm": 1.0713649988174438, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436726272, + "loss": 1.334, + "grad_norm": 0.9470629096031189, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436721664, + "loss": 1.3814, + "grad_norm": 1.5658239126205444, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436732416, + "loss": 1.3322, + "grad_norm": 0.9318075776100159, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436786176, + "loss": 1.3782, + "grad_norm": 1.1330004930496216, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436703232, + "loss": 1.3603, + "grad_norm": 1.074501872062683, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43673088, + "loss": 1.3696, + "grad_norm": 1.0834554433822632, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436729344, + "loss": 1.3415, + "grad_norm": 1.1225998401641846, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436772352, + "loss": 1.3524, + "grad_norm": 0.8675007224082947, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436770816, + "loss": 1.3333, + "grad_norm": 1.0830079317092896, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436821504, + "loss": 1.3587, + "grad_norm": 1.611135721206665, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436827648, + "loss": 1.351, + "grad_norm": 1.169464349746704, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436766208, + "loss": 1.3519, + "grad_norm": 1.103286623954773, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436755456, + "loss": 1.415, + "grad_norm": 1.6089816093444824, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436780032, + "loss": 1.3876, + "grad_norm": 1.1294152736663818, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436796928, + "loss": 1.3184, + "grad_norm": 1.0290534496307373, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436667904, + "loss": 1.3652, + "grad_norm": 0.8614636063575745, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436843008, + "loss": 1.3237, + "grad_norm": 1.0500050783157349, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4368, + "loss": 1.3284, + "grad_norm": 0.9447882771492004, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436709376, + "loss": 1.3361, + "grad_norm": 0.8990108370780945, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436657152, + "loss": 1.3426, + "grad_norm": 0.6518520712852478, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436672512, + "loss": 1.4059, + "grad_norm": 1.1936618089675903, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436773888, + "loss": 1.3323, + "grad_norm": 0.7533771991729736, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436783104, + "loss": 1.3635, + "grad_norm": 1.1786432266235352, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436812288, + "loss": 1.3207, + "grad_norm": 0.8457615375518799, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436873728, + "loss": 1.3288, + "grad_norm": 0.7218217849731445, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436727808, + "loss": 1.3253, + "grad_norm": 0.8112698793411255, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43681536, + "loss": 1.3147, + "grad_norm": 1.3152931928634644, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.3382, + "grad_norm": 0.8810279369354248, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436789248, + "loss": 1.3254, + "grad_norm": 0.7967131733894348, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436824576, + "loss": 1.324, + "grad_norm": 1.2178125381469727, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436763136, + "loss": 1.2672, + "grad_norm": 0.9049829244613647, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436720128, + "loss": 1.312, + "grad_norm": 0.9222964644432068, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436721664, + "loss": 1.38, + "grad_norm": 0.8797817826271057, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436758528, + "loss": 1.3048, + "grad_norm": 0.8464904427528381, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43673088, + "loss": 1.3004, + "grad_norm": 0.8803203701972961, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436787712, + "loss": 1.3582, + "grad_norm": 1.1918636560440063, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436816896, + "loss": 1.3408, + "grad_norm": 1.5736128091812134, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436717056, + "loss": 1.3286, + "grad_norm": 1.5431745052337646, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436740096, + "loss": 1.3202, + "grad_norm": 1.4597489833831787, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43671552, + "loss": 1.369, + "grad_norm": 1.4939475059509277, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43680768, + "loss": 1.3622, + "grad_norm": 1.12813401222229, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43667712, + "loss": 1.3141, + "grad_norm": 1.0926592350006104, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436813824, + "loss": 1.3688, + "grad_norm": 0.9176599979400635, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436670976, + "loss": 1.3319, + "grad_norm": 1.3738716840744019, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436694016, + "loss": 1.3144, + "grad_norm": 0.8720364570617676, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436741632, + "loss": 1.3328, + "grad_norm": 1.0878186225891113, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43678464, + "loss": 1.3539, + "grad_norm": 0.8469704389572144, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436712448, + "loss": 1.3382, + "grad_norm": 1.5568212270736694, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436810752, + "loss": 1.3585, + "grad_norm": 1.5294939279556274, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436706304, + "loss": 1.3161, + "grad_norm": 0.9430554509162903, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43686144, + "loss": 1.336, + "grad_norm": 0.9589649438858032, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436694016, + "loss": 1.3488, + "grad_norm": 0.841921329498291, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436737024, + "loss": 1.3655, + "grad_norm": 1.0995687246322632, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436758528, + "loss": 1.3275, + "grad_norm": 0.9871271848678589, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436681728, + "loss": 1.3261, + "grad_norm": 1.0054882764816284, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436818432, + "loss": 1.3269, + "grad_norm": 1.5892537832260132, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436706304, + "loss": 1.293, + "grad_norm": 1.00835382938385, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436703232, + "loss": 1.3928, + "grad_norm": 0.9828410744667053, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43667712, + "loss": 1.2711, + "grad_norm": 0.8667196035385132, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436712448, + "loss": 1.2606, + "grad_norm": 1.1051603555679321, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436727808, + "loss": 1.3117, + "grad_norm": 1.1522477865219116, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436721664, + "loss": 1.3349, + "grad_norm": 0.8900982141494751, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436733952, + "loss": 1.3789, + "grad_norm": 1.4223146438598633, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436766208, + "loss": 1.2972, + "grad_norm": 1.1988942623138428, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436767744, + "loss": 1.3185, + "grad_norm": 0.8718321323394775, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436740096, + "loss": 1.3326, + "grad_norm": 1.2118189334869385, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436789248, + "loss": 1.3088, + "grad_norm": 1.1309117078781128, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.3702, + "grad_norm": 1.3940666913986206, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43677696, + "loss": 1.2716, + "grad_norm": 1.3596471548080444, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436767744, + "loss": 1.2456, + "grad_norm": 0.9311547875404358, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436701696, + "loss": 1.3189, + "grad_norm": 1.2461668252944946, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4367232, + "loss": 1.3403, + "grad_norm": 1.18405020236969, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436778496, + "loss": 1.3616, + "grad_norm": 1.9778786897659302, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436695552, + "loss": 1.3167, + "grad_norm": 0.9370551109313965, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436760064, + "loss": 1.3515, + "grad_norm": 1.1121931076049805, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.3285, + "grad_norm": 1.306088924407959, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436750848, + "loss": 1.3566, + "grad_norm": 1.346854329109192, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436724736, + "loss": 1.3189, + "grad_norm": 1.3941583633422852, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.3337, + "grad_norm": 0.9706240892410278, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436729344, + "loss": 1.3881, + "grad_norm": 1.3901233673095703, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436634112, + "loss": 1.375, + "grad_norm": 1.2406089305877686, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436703232, + "loss": 1.307, + "grad_norm": 1.1133575439453125, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436750848, + "loss": 1.3129, + "grad_norm": 1.2303940057754517, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436695552, + "loss": 1.3354, + "grad_norm": 1.8455513715744019, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43675392, + "loss": 1.3738, + "grad_norm": 1.2305988073349, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436760064, + "loss": 1.28, + "grad_norm": 1.1454541683197021, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436896768, + "loss": 1.2774, + "grad_norm": 0.9283663034439087, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436706304, + "loss": 1.2801, + "grad_norm": 1.1059356927871704, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436770816, + "loss": 1.3079, + "grad_norm": 1.2374422550201416, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436686336, + "loss": 1.3206, + "grad_norm": 1.6041053533554077, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436703232, + "loss": 1.3073, + "grad_norm": 1.0383658409118652, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436703232, + "loss": 1.2766, + "grad_norm": 1.0319260358810425, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436829184, + "loss": 1.3521, + "grad_norm": 0.9934118986129761, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436709376, + "loss": 1.2922, + "grad_norm": 1.3146089315414429, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436867584, + "loss": 1.343, + "grad_norm": 1.041687250137329, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43669248, + "loss": 1.2894, + "grad_norm": 0.9384854435920715, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436710912, + "loss": 1.3701, + "grad_norm": 1.0016591548919678, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436887552, + "loss": 1.2722, + "grad_norm": 1.0993374586105347, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4367616, + "loss": 1.2972, + "grad_norm": 0.9385271072387695, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436758528, + "loss": 1.3179, + "grad_norm": 1.3874512910842896, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436767744, + "loss": 1.359, + "grad_norm": 1.12680983543396, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436643328, + "loss": 1.3363, + "grad_norm": 1.1593486070632935, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436925952, + "loss": 1.3338, + "grad_norm": 1.3790289163589478, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436689408, + "loss": 1.3035, + "grad_norm": 1.0079233646392822, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43678464, + "loss": 1.3631, + "grad_norm": 0.9196240901947021, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436750848, + "loss": 1.3543, + "grad_norm": 1.7374764680862427, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.3676, + "grad_norm": 1.4939314126968384, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436737024, + "loss": 1.3755, + "grad_norm": 1.283039927482605, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436787712, + "loss": 1.4174, + "grad_norm": 1.4029946327209473, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436683264, + "loss": 1.3638, + "grad_norm": 1.183237910270691, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436712448, + "loss": 1.327, + "grad_norm": 1.1774239540100098, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436655616, + "loss": 1.3798, + "grad_norm": 1.0397051572799683, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43673088, + "loss": 1.2692, + "grad_norm": 1.1654272079467773, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.754468352, + "gpu_mem": 4.4367232, + "loss": 1.3461, + "grad_norm": 1.0464783906936646, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43675392, + "loss": 1.3184, + "grad_norm": 1.0117496252059937, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436683264, + "loss": 1.3487, + "grad_norm": 1.0898796319961548, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436709376, + "loss": 1.303, + "grad_norm": 1.2821986675262451, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436804608, + "loss": 1.3238, + "grad_norm": 1.2596768140792847, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43670016, + "loss": 1.2797, + "grad_norm": 1.125903844833374, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436709376, + "loss": 1.3797, + "grad_norm": 1.066947102546692, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436737024, + "loss": 1.3859, + "grad_norm": 1.3472384214401245, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436750848, + "loss": 1.3775, + "grad_norm": 1.5011615753173828, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436744704, + "loss": 1.3357, + "grad_norm": 1.1852895021438599, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43670784, + "loss": 1.3193, + "grad_norm": 1.2091888189315796, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436766208, + "loss": 1.3501, + "grad_norm": 1.5426965951919556, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436675584, + "loss": 1.3597, + "grad_norm": 1.349898099899292, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436760064, + "loss": 1.3197, + "grad_norm": 1.3502475023269653, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436798464, + "loss": 1.2797, + "grad_norm": 1.4866161346435547, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436843008, + "loss": 1.3884, + "grad_norm": 1.2403790950775146, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436804608, + "loss": 1.2733, + "grad_norm": 1.2063604593276978, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436741632, + "loss": 1.3637, + "grad_norm": 1.293779730796814, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436795392, + "loss": 1.366, + "grad_norm": 1.4982142448425293, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436733952, + "loss": 1.3494, + "grad_norm": 1.8843094110488892, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436720128, + "loss": 1.2715, + "grad_norm": 1.0696361064910889, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436932096, + "loss": 1.2674, + "grad_norm": 1.0314807891845703, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436796928, + "loss": 1.2918, + "grad_norm": 2.015803337097168, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436770816, + "loss": 1.3413, + "grad_norm": 1.2085055112838745, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43677696, + "loss": 1.307, + "grad_norm": 1.069379210472107, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436810752, + "loss": 1.3884, + "grad_norm": 1.5220832824707031, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.4082, + "grad_norm": 1.1597318649291992, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436803072, + "loss": 1.3793, + "grad_norm": 1.3705661296844482, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436690944, + "loss": 1.2997, + "grad_norm": 0.9692081212997437, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43675392, + "loss": 1.3854, + "grad_norm": 1.3916468620300293, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.3298, + "grad_norm": 1.313440203666687, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436718592, + "loss": 1.3245, + "grad_norm": 1.0562657117843628, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436803072, + "loss": 1.3452, + "grad_norm": 1.4710932970046997, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436749312, + "loss": 1.3669, + "grad_norm": 1.1509836912155151, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43676928, + "loss": 1.3232, + "grad_norm": 1.4320374727249146, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436697088, + "loss": 1.3238, + "grad_norm": 1.9172191619873047, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43678464, + "loss": 1.3993, + "grad_norm": 1.2157620191574097, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436752384, + "loss": 1.2784, + "grad_norm": 1.046169400215149, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.754468352, + "gpu_mem": 4.43673088, + "loss": 1.2559, + "grad_norm": 1.0000903606414795, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436674048, + "loss": 1.3405, + "grad_norm": 1.9830927848815918, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436740096, + "loss": 1.4212, + "grad_norm": 1.175218939781189, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436781568, + "loss": 1.3349, + "grad_norm": 0.9889572858810425, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.754468352, + "gpu_mem": 4.436781568, + "train_runtime": 8572.2835, + "train_samples_per_second": 4.398, + "train_steps_per_second": 0.069, + "total_flos": 8.835531003429274e+16, + "train_loss": 1.4245477939138607 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..03eabaea80bc9f8c1936ead28264f565a8ac69c0 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4270d53ff68b935c8cde29363f9a8eec8ed8daad --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.4404048750258211 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..52754254245b15b66659b44fe6dc8847c3612c65 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-logiqa-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2", + "seed": 42, + "timestamp": "2025-08-30T05:27:39.999429" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..f8180e63e88ccffe5aee0405ebf8f8376f7f5e77 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r32-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.901764608, + "gpu_mem": 4.518727168, + "loss": 3.8396, + "grad_norm": 59.99252700805664, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.907073024, + "gpu_mem": 4.720597504, + "loss": 3.9728, + "grad_norm": 59.67111587524414, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.907073024, + "gpu_mem": 4.720674304, + "loss": 3.553, + "grad_norm": 53.67893600463867, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720572928, + "loss": 2.9619, + "grad_norm": 45.048484802246094, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720588288, + "loss": 2.5426, + "grad_norm": 27.52631187438965, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720580608, + "loss": 2.1516, + "grad_norm": 15.725869178771973, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720651264, + "loss": 1.7714, + "grad_norm": 10.264392852783203, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720635904, + "loss": 1.5726, + "grad_norm": 5.139598369598389, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720634368, + "loss": 1.5024, + "grad_norm": 3.5041873455047607, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72064512, + "loss": 1.4251, + "grad_norm": 3.0726158618927, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720548352, + "loss": 1.3888, + "grad_norm": 2.290050745010376, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72059904, + "loss": 1.4388, + "grad_norm": 3.6388707160949707, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7206912, + "loss": 1.3846, + "grad_norm": 1.3347753286361694, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720603648, + "loss": 1.4826, + "grad_norm": 3.605297327041626, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720741888, + "loss": 1.4351, + "grad_norm": 2.1151256561279297, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720602112, + "loss": 1.4496, + "grad_norm": 3.1328134536743164, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720632832, + "loss": 1.3674, + "grad_norm": 1.4650118350982666, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720595968, + "loss": 1.3929, + "grad_norm": 2.603435754776001, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720503808, + "loss": 1.3799, + "grad_norm": 0.8704575300216675, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720542208, + "loss": 1.372, + "grad_norm": 0.9829480648040771, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72067584, + "loss": 1.459, + "grad_norm": 3.492403268814087, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720574464, + "loss": 1.4387, + "grad_norm": 2.5575737953186035, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720602112, + "loss": 1.4426, + "grad_norm": 2.993204355239868, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720595968, + "loss": 1.3913, + "grad_norm": 1.5055561065673828, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720602112, + "loss": 1.4397, + "grad_norm": 2.678650140762329, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720654336, + "loss": 1.4374, + "grad_norm": 2.8956878185272217, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720595968, + "loss": 1.4214, + "grad_norm": 2.037658929824829, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720542208, + "loss": 1.4185, + "grad_norm": 1.2095122337341309, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720634368, + "loss": 1.3916, + "grad_norm": 1.0310418605804443, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72062976, + "loss": 1.3469, + "grad_norm": 1.6156489849090576, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720608256, + "loss": 1.3627, + "grad_norm": 1.092143177986145, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720612864, + "loss": 1.5446, + "grad_norm": 3.8374016284942627, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720648192, + "loss": 1.3663, + "grad_norm": 1.8016067743301392, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720572928, + "loss": 1.4599, + "grad_norm": 2.552316427230835, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720620544, + "loss": 1.3576, + "grad_norm": 0.8835170269012451, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720635904, + "loss": 1.5555, + "grad_norm": 4.316595554351807, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720701952, + "loss": 1.6498, + "grad_norm": 5.0872321128845215, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720585216, + "loss": 1.3883, + "grad_norm": 1.0261530876159668, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720701952, + "loss": 1.4742, + "grad_norm": 1.7385681867599487, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720623616, + "loss": 1.3464, + "grad_norm": 0.4705820381641388, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72052224, + "loss": 1.4086, + "grad_norm": 1.3943504095077515, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720594432, + "loss": 1.4611, + "grad_norm": 1.8179737329483032, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720557568, + "loss": 1.4186, + "grad_norm": 1.4873411655426025, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720592896, + "loss": 1.4839, + "grad_norm": 2.3078815937042236, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720646656, + "loss": 1.4429, + "grad_norm": 1.3310329914093018, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720694272, + "loss": 1.4083, + "grad_norm": 1.3149898052215576, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72052224, + "loss": 1.3924, + "grad_norm": 1.2706291675567627, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720576, + "loss": 1.4856, + "grad_norm": 3.344759941101074, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720565248, + "loss": 1.5709, + "grad_norm": 4.3566575050354, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720574464, + "loss": 1.4302, + "grad_norm": 1.9431793689727783, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720669696, + "loss": 1.3816, + "grad_norm": 0.8050864338874817, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720611328, + "loss": 1.458, + "grad_norm": 2.169312000274658, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720689664, + "loss": 1.4728, + "grad_norm": 2.371241807937622, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720589824, + "loss": 1.4387, + "grad_norm": 1.8364287614822388, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720586752, + "loss": 1.4856, + "grad_norm": 2.710259199142456, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720631296, + "loss": 1.4107, + "grad_norm": 1.6049015522003174, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720623616, + "loss": 1.3945, + "grad_norm": 1.3367737531661987, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720646656, + "loss": 1.412, + "grad_norm": 1.7989500761032104, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7206144, + "loss": 1.5412, + "grad_norm": 3.7082583904266357, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720605184, + "loss": 1.3695, + "grad_norm": 0.8449872136116028, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720642048, + "loss": 1.4134, + "grad_norm": 2.4225635528564453, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720681984, + "loss": 1.3882, + "grad_norm": 1.0452338457107544, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720611328, + "loss": 1.4061, + "grad_norm": 1.530485987663269, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720513024, + "loss": 1.4679, + "grad_norm": 2.8326432704925537, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720594432, + "loss": 1.4525, + "grad_norm": 1.906180739402771, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72069888, + "loss": 1.3913, + "grad_norm": 0.42727380990982056, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720576, + "loss": 1.3908, + "grad_norm": 1.0744200944900513, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720628224, + "loss": 1.4548, + "grad_norm": 1.5512852668762207, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720620544, + "loss": 1.4201, + "grad_norm": 1.200818657875061, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720536064, + "loss": 1.4204, + "grad_norm": 1.1756865978240967, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720559104, + "loss": 1.4138, + "grad_norm": 1.628706932067871, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720608256, + "loss": 1.3833, + "grad_norm": 0.6174067258834839, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720562176, + "loss": 1.4092, + "grad_norm": 0.7638588547706604, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720615936, + "loss": 1.4304, + "grad_norm": 1.2857800722122192, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720658944, + "loss": 1.4875, + "grad_norm": 2.6705689430236816, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72060672, + "loss": 1.4255, + "grad_norm": 1.612109899520874, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72054528, + "loss": 1.3903, + "grad_norm": 0.662988007068634, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720646656, + "loss": 1.3576, + "grad_norm": 0.8656507134437561, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720663552, + "loss": 1.3104, + "grad_norm": 1.7098761796951294, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720503808, + "loss": 1.5638, + "grad_norm": 3.7132842540740967, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720609792, + "loss": 1.4576, + "grad_norm": 2.9683139324188232, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720582144, + "loss": 1.5866, + "grad_norm": 4.857396602630615, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720595968, + "loss": 1.4623, + "grad_norm": 2.821737766265869, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720602112, + "loss": 1.5676, + "grad_norm": 6.004476547241211, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720631296, + "loss": 1.4458, + "grad_norm": 2.314448833465576, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720572928, + "loss": 1.4135, + "grad_norm": 1.7498201131820679, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720594432, + "loss": 1.4108, + "grad_norm": 1.0758187770843506, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720620544, + "loss": 1.3859, + "grad_norm": 1.9811064004898071, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72056832, + "loss": 1.4342, + "grad_norm": 1.9113335609436035, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720600576, + "loss": 1.4759, + "grad_norm": 2.7367117404937744, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720592896, + "loss": 1.3386, + "grad_norm": 0.958081841468811, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720532992, + "loss": 1.4321, + "grad_norm": 1.3418999910354614, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720734208, + "loss": 1.4222, + "grad_norm": 1.2392725944519043, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720548352, + "loss": 1.4054, + "grad_norm": 1.3942866325378418, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720715776, + "loss": 1.3992, + "grad_norm": 0.6622376441955566, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720589824, + "loss": 1.4214, + "grad_norm": 1.4259381294250488, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720585216, + "loss": 1.4267, + "grad_norm": 1.2295571565628052, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720632832, + "loss": 1.405, + "grad_norm": 0.9338927865028381, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7206144, + "loss": 1.3682, + "grad_norm": 1.1455888748168945, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720579072, + "loss": 1.407, + "grad_norm": 0.9003887176513672, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720562176, + "loss": 1.4379, + "grad_norm": 0.7834309339523315, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720612864, + "loss": 1.4459, + "grad_norm": 1.0764751434326172, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720572928, + "loss": 1.3951, + "grad_norm": 0.600545346736908, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720662016, + "loss": 1.4377, + "grad_norm": 1.6442540884017944, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720531456, + "loss": 1.4681, + "grad_norm": 3.0231728553771973, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720594432, + "loss": 1.4268, + "grad_norm": 1.485253930091858, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720589824, + "loss": 1.399, + "grad_norm": 0.8487690091133118, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720579072, + "loss": 1.3679, + "grad_norm": 0.3334135115146637, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720748032, + "loss": 1.3924, + "grad_norm": 0.8910844326019287, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72054528, + "loss": 1.3866, + "grad_norm": 1.5437201261520386, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720546816, + "loss": 1.4113, + "grad_norm": 1.8648587465286255, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720646656, + "loss": 1.4232, + "grad_norm": 1.5794148445129395, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72074496, + "loss": 1.4035, + "grad_norm": 1.3009464740753174, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720574464, + "loss": 1.4148, + "grad_norm": 1.124483585357666, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72058368, + "loss": 1.4323, + "grad_norm": 1.3276902437210083, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72064512, + "loss": 1.4043, + "grad_norm": 1.0728600025177002, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7205376, + "loss": 1.3901, + "grad_norm": 0.4750644266605377, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720632832, + "loss": 1.3997, + "grad_norm": 0.5814637541770935, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720766464, + "loss": 1.3955, + "grad_norm": 1.0561155080795288, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720669696, + "loss": 1.4473, + "grad_norm": 2.33194637298584, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720715776, + "loss": 1.3964, + "grad_norm": 1.0123968124389648, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72066048, + "loss": 1.3771, + "grad_norm": 0.3792378902435303, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72069888, + "loss": 1.4741, + "grad_norm": 1.648614764213562, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72062208, + "loss": 1.3681, + "grad_norm": 0.7505699992179871, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720657408, + "loss": 1.4321, + "grad_norm": 1.083065390586853, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720563712, + "loss": 1.352, + "grad_norm": 0.4003942906856537, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720588288, + "loss": 1.4088, + "grad_norm": 0.8949209451675415, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72056064, + "loss": 1.365, + "grad_norm": 0.311297744512558, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720566784, + "loss": 1.3986, + "grad_norm": 0.9927392601966858, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720557568, + "loss": 1.4665, + "grad_norm": 1.3895574808120728, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72049152, + "loss": 1.4256, + "grad_norm": 1.0016359090805054, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720654336, + "loss": 1.4143, + "grad_norm": 0.8196082711219788, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720566784, + "loss": 1.4124, + "grad_norm": 1.549772024154663, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720569856, + "loss": 1.3994, + "grad_norm": 0.6074214577674866, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720625152, + "loss": 1.4096, + "grad_norm": 0.9988752007484436, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720528384, + "loss": 1.4503, + "grad_norm": 1.1939074993133545, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720695808, + "loss": 1.404, + "grad_norm": 0.8434684872627258, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720562176, + "loss": 1.3611, + "grad_norm": 0.7427964806556702, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720551424, + "loss": 1.4059, + "grad_norm": 1.4208147525787354, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720603648, + "loss": 1.3841, + "grad_norm": 0.9749029874801636, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720528384, + "loss": 1.4063, + "grad_norm": 1.1700817346572876, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720589824, + "loss": 1.4663, + "grad_norm": 1.2552353143692017, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720588288, + "loss": 1.4056, + "grad_norm": 1.0296167135238647, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720532992, + "loss": 1.363, + "grad_norm": 0.2649649381637573, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72052992, + "loss": 1.4154, + "grad_norm": 0.8450047969818115, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720631296, + "loss": 1.3872, + "grad_norm": 0.8766883611679077, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720635904, + "loss": 1.3835, + "grad_norm": 0.7928040027618408, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720734208, + "loss": 1.4135, + "grad_norm": 0.7650040984153748, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720577536, + "loss": 1.3936, + "grad_norm": 0.5712445974349976, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720569856, + "loss": 1.4041, + "grad_norm": 0.7785497307777405, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720513024, + "loss": 1.3797, + "grad_norm": 0.3148995637893677, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72054528, + "loss": 1.388, + "grad_norm": 0.44492611289024353, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720572928, + "loss": 1.4022, + "grad_norm": 1.038348913192749, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720623616, + "loss": 1.3954, + "grad_norm": 0.7102695107460022, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720536064, + "loss": 1.3933, + "grad_norm": 0.2385653853416443, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720556032, + "loss": 1.3837, + "grad_norm": 0.632422149181366, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720632832, + "loss": 1.399, + "grad_norm": 0.7155194282531738, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720534528, + "loss": 1.3891, + "grad_norm": 0.26147013902664185, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720635904, + "loss": 1.4021, + "grad_norm": 0.6663335561752319, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720592896, + "loss": 1.3978, + "grad_norm": 0.7611777782440186, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72059136, + "loss": 1.3716, + "grad_norm": 0.3903695046901703, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720574464, + "loss": 1.3871, + "grad_norm": 0.2537837624549866, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7206912, + "loss": 1.4169, + "grad_norm": 0.6235453486442566, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720586752, + "loss": 1.3908, + "grad_norm": 0.4745137095451355, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720666624, + "loss": 1.3956, + "grad_norm": 0.39399972558021545, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720576, + "loss": 1.3593, + "grad_norm": 0.38647961616516113, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7205376, + "loss": 1.3952, + "grad_norm": 1.182348370552063, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720540672, + "loss": 1.3868, + "grad_norm": 0.7920267581939697, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720608256, + "loss": 1.3737, + "grad_norm": 0.7145357728004456, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720615936, + "loss": 1.3939, + "grad_norm": 0.4024410545825958, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720589824, + "loss": 1.4027, + "grad_norm": 0.5698131918907166, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720569856, + "loss": 1.3828, + "grad_norm": 0.31165170669555664, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72060672, + "loss": 1.3911, + "grad_norm": 0.7065814137458801, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720549888, + "loss": 1.3939, + "grad_norm": 0.4439936578273773, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720620544, + "loss": 1.3702, + "grad_norm": 0.6827486157417297, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720517632, + "loss": 1.3673, + "grad_norm": 0.25498485565185547, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720526848, + "loss": 1.4264, + "grad_norm": 1.038550853729248, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720603648, + "loss": 1.4003, + "grad_norm": 0.659854531288147, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720611328, + "loss": 1.3807, + "grad_norm": 0.5350374579429626, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720617472, + "loss": 1.3912, + "grad_norm": 0.5421314239501953, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720536064, + "loss": 1.3948, + "grad_norm": 0.4251003861427307, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720646656, + "loss": 1.4, + "grad_norm": 0.6372740864753723, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720634368, + "loss": 1.3764, + "grad_norm": 0.3400779664516449, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720617472, + "loss": 1.3875, + "grad_norm": 1.0121591091156006, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720556032, + "loss": 1.371, + "grad_norm": 1.0047165155410767, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720588288, + "loss": 1.3514, + "grad_norm": 0.21813718974590302, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720685056, + "loss": 1.3601, + "grad_norm": 0.31030571460723877, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720657408, + "loss": 1.3685, + "grad_norm": 0.5505935549736023, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720562176, + "loss": 1.5429, + "grad_norm": 1.793271541595459, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720594432, + "loss": 1.3878, + "grad_norm": 0.7305254340171814, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72058368, + "loss": 1.4118, + "grad_norm": 0.59135901927948, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720520704, + "loss": 1.4208, + "grad_norm": 1.2517582178115845, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720642048, + "loss": 1.3842, + "grad_norm": 0.20054444670677185, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720563712, + "loss": 1.4128, + "grad_norm": 1.237249732017517, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72067584, + "loss": 1.3733, + "grad_norm": 0.5007869601249695, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720671232, + "loss": 1.396, + "grad_norm": 1.0157169103622437, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 2.0757, + "grad_norm": 1.0732213258743286, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821596672, + "loss": 1.3943, + "grad_norm": 0.7270272374153137, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8215936, + "loss": 1.3737, + "grad_norm": 0.9400615096092224, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8214784, + "loss": 1.3746, + "grad_norm": 0.8902804255485535, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821529088, + "loss": 1.4119, + "grad_norm": 1.3377816677093506, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821522944, + "loss": 1.4581, + "grad_norm": 1.7516824007034302, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82157056, + "loss": 1.3795, + "grad_norm": 0.5005955100059509, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821502976, + "loss": 1.3491, + "grad_norm": 0.4736703336238861, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821549056, + "loss": 1.3917, + "grad_norm": 0.6640774607658386, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82156288, + "loss": 1.3777, + "grad_norm": 0.8200928568840027, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821518336, + "loss": 1.3678, + "grad_norm": 0.9486890435218811, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821510656, + "loss": 1.4051, + "grad_norm": 1.1935594081878662, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821632, + "loss": 1.3798, + "grad_norm": 1.0667433738708496, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821529088, + "loss": 1.3766, + "grad_norm": 0.8363673686981201, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 1.3706, + "grad_norm": 0.8627127408981323, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821533696, + "loss": 1.3714, + "grad_norm": 0.5216781497001648, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821459968, + "loss": 1.4364, + "grad_norm": 1.0127185583114624, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82145536, + "loss": 1.3634, + "grad_norm": 0.6949228048324585, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821506048, + "loss": 1.3616, + "grad_norm": 1.143639087677002, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821635072, + "loss": 1.4057, + "grad_norm": 1.1576107740402222, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821496832, + "loss": 1.3824, + "grad_norm": 1.0592457056045532, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821522944, + "loss": 1.3605, + "grad_norm": 0.7536828517913818, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821535232, + "loss": 1.3795, + "grad_norm": 0.773123025894165, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821507584, + "loss": 1.4662, + "grad_norm": 2.223757028579712, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821481472, + "loss": 1.3907, + "grad_norm": 0.688571572303772, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82149376, + "loss": 1.4063, + "grad_norm": 0.84425288438797, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821521408, + "loss": 1.3553, + "grad_norm": 0.7873929142951965, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82153984, + "loss": 1.3851, + "grad_norm": 0.5374301075935364, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821590528, + "loss": 1.394, + "grad_norm": 1.1137584447860718, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821541376, + "loss": 1.4279, + "grad_norm": 1.3788444995880127, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821492224, + "loss": 1.4276, + "grad_norm": 1.7641851902008057, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821536768, + "loss": 1.4022, + "grad_norm": 0.8321373462677002, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821435392, + "loss": 1.3989, + "grad_norm": 1.38264799118042, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821502976, + "loss": 1.4009, + "grad_norm": 0.6628998517990112, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82149376, + "loss": 1.3616, + "grad_norm": 0.4883979260921478, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821498368, + "loss": 1.385, + "grad_norm": 1.8598504066467285, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821613568, + "loss": 1.419, + "grad_norm": 1.0684043169021606, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821573632, + "loss": 1.4336, + "grad_norm": 1.1408436298370361, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82153216, + "loss": 1.4026, + "grad_norm": 1.1013821363449097, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 1.3791, + "grad_norm": 1.1608268022537231, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8215552, + "loss": 1.38, + "grad_norm": 0.47783467173576355, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821575168, + "loss": 1.3846, + "grad_norm": 0.7773339152336121, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821492224, + "loss": 1.4389, + "grad_norm": 1.6273157596588135, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821627392, + "loss": 1.3975, + "grad_norm": 1.0677049160003662, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821581312, + "loss": 1.3654, + "grad_norm": 0.7847086787223816, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821602816, + "loss": 1.4061, + "grad_norm": 1.369586706161499, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821489152, + "loss": 1.3315, + "grad_norm": 1.684906005859375, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821476864, + "loss": 1.512, + "grad_norm": 2.912116050720215, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821481472, + "loss": 1.4806, + "grad_norm": 2.643267869949341, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821518336, + "loss": 1.4542, + "grad_norm": 2.125279426574707, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821479936, + "loss": 1.343, + "grad_norm": 0.8747023940086365, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821526016, + "loss": 1.4241, + "grad_norm": 0.9106222987174988, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821535232, + "loss": 1.3888, + "grad_norm": 0.45617902278900146, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821573632, + "loss": 1.3944, + "grad_norm": 1.260534405708313, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821492224, + "loss": 1.3956, + "grad_norm": 0.6775566935539246, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82150144, + "loss": 1.3656, + "grad_norm": 0.9624189734458923, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821567488, + "loss": 1.3805, + "grad_norm": 0.8021829128265381, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82149376, + "loss": 1.3539, + "grad_norm": 0.34260293841362, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821487616, + "loss": 1.3811, + "grad_norm": 0.5769440531730652, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821553664, + "loss": 1.375, + "grad_norm": 0.5958117246627808, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821512192, + "loss": 1.3573, + "grad_norm": 0.43720385432243347, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821625856, + "loss": 1.3682, + "grad_norm": 0.9466622471809387, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821502976, + "loss": 1.3706, + "grad_norm": 0.5799251794815063, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821529088, + "loss": 1.3712, + "grad_norm": 0.4343704879283905, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821535232, + "loss": 1.3753, + "grad_norm": 1.0381672382354736, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821602816, + "loss": 1.3872, + "grad_norm": 0.6213380694389343, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821458432, + "loss": 1.3749, + "grad_norm": 0.9851323366165161, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821592064, + "loss": 1.371, + "grad_norm": 0.7497409582138062, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821621248, + "loss": 1.3436, + "grad_norm": 0.8570588231086731, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821490688, + "loss": 1.3818, + "grad_norm": 1.322851300239563, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821492224, + "loss": 1.3868, + "grad_norm": 0.8798698782920837, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82149376, + "loss": 1.3664, + "grad_norm": 0.979320764541626, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821567488, + "loss": 1.4157, + "grad_norm": 1.4372776746749878, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821549056, + "loss": 1.3389, + "grad_norm": 0.7059783935546875, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821492224, + "loss": 1.3768, + "grad_norm": 0.6480652093887329, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 1.375, + "grad_norm": 1.0025056600570679, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821598208, + "loss": 1.3603, + "grad_norm": 0.7130928635597229, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821550592, + "loss": 1.3649, + "grad_norm": 0.6306337118148804, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82167808, + "loss": 1.381, + "grad_norm": 0.7618113160133362, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82156288, + "loss": 1.3675, + "grad_norm": 0.6250836253166199, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821512192, + "loss": 1.355, + "grad_norm": 0.6826677322387695, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821499904, + "loss": 1.3168, + "grad_norm": 0.614322304725647, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821492224, + "loss": 1.3574, + "grad_norm": 0.516415536403656, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821615104, + "loss": 1.3695, + "grad_norm": 0.9311376810073853, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8215552, + "loss": 1.4083, + "grad_norm": 1.2857376337051392, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82150144, + "loss": 1.4045, + "grad_norm": 1.0299582481384277, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821518336, + "loss": 1.3534, + "grad_norm": 0.6391008496284485, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821658112, + "loss": 1.4216, + "grad_norm": 0.7651782631874084, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821529088, + "loss": 1.4418, + "grad_norm": 0.9582176804542542, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82152448, + "loss": 1.4197, + "grad_norm": 0.8249621987342834, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821427712, + "loss": 1.4054, + "grad_norm": 0.709058940410614, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821502976, + "loss": 1.3848, + "grad_norm": 0.6203559637069702, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821522944, + "loss": 1.3729, + "grad_norm": 0.4898349940776825, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821489152, + "loss": 1.382, + "grad_norm": 0.4060085117816925, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821450752, + "loss": 1.4118, + "grad_norm": 0.7285274267196655, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821576704, + "loss": 1.4023, + "grad_norm": 0.7276368737220764, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821453824, + "loss": 1.3998, + "grad_norm": 0.7203302383422852, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821496832, + "loss": 1.3853, + "grad_norm": 0.4345015585422516, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821479936, + "loss": 1.3877, + "grad_norm": 1.0828698873519897, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821521408, + "loss": 1.4189, + "grad_norm": 1.0149608850479126, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821461504, + "loss": 1.386, + "grad_norm": 0.6397606134414673, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821484544, + "loss": 1.3896, + "grad_norm": 0.8548509478569031, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821473792, + "loss": 1.3773, + "grad_norm": 0.4836077094078064, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82148608, + "loss": 1.377, + "grad_norm": 0.42877694964408875, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821489152, + "loss": 1.3709, + "grad_norm": 0.9926077723503113, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82147072, + "loss": 1.3833, + "grad_norm": 0.34563905000686646, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821476864, + "loss": 1.4106, + "grad_norm": 0.737495481967926, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82142464, + "loss": 1.4429, + "grad_norm": 1.259983777999878, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821496832, + "loss": 1.3521, + "grad_norm": 0.45584774017333984, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821538304, + "loss": 1.3885, + "grad_norm": 0.8280604481697083, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82148608, + "loss": 1.3645, + "grad_norm": 0.611052930355072, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821496832, + "loss": 1.3878, + "grad_norm": 0.5024601817131042, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821479936, + "loss": 1.4179, + "grad_norm": 0.9302980303764343, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821535232, + "loss": 1.3837, + "grad_norm": 0.39204055070877075, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82150144, + "loss": 1.3762, + "grad_norm": 0.2946418523788452, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821510656, + "loss": 1.3787, + "grad_norm": 0.42769718170166016, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821512192, + "loss": 1.3609, + "grad_norm": 0.6987353563308716, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82173952, + "loss": 1.3999, + "grad_norm": 0.7235163450241089, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82153216, + "loss": 1.378, + "grad_norm": 0.5818735957145691, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821504512, + "loss": 1.3858, + "grad_norm": 0.7103995680809021, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821469184, + "loss": 1.3498, + "grad_norm": 0.6764567494392395, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821559808, + "loss": 1.3648, + "grad_norm": 0.7487600445747375, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821484544, + "loss": 1.3525, + "grad_norm": 0.684175968170166, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82153216, + "loss": 1.3389, + "grad_norm": 0.5159670114517212, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821473792, + "loss": 1.3724, + "grad_norm": 0.9983177185058594, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821694976, + "loss": 1.4022, + "grad_norm": 1.1525691747665405, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821576704, + "loss": 1.3666, + "grad_norm": 0.6710824370384216, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821469184, + "loss": 1.3555, + "grad_norm": 0.9384012818336487, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82150912, + "loss": 1.3342, + "grad_norm": 0.5504294037818909, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821556736, + "loss": 1.3258, + "grad_norm": 0.7388536334037781, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821522944, + "loss": 1.3763, + "grad_norm": 0.9521717429161072, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82157824, + "loss": 1.3488, + "grad_norm": 0.8228641748428345, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82147072, + "loss": 1.3751, + "grad_norm": 1.09230637550354, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8215552, + "loss": 1.3583, + "grad_norm": 1.600640058517456, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821544448, + "loss": 1.2802, + "grad_norm": 0.9562586545944214, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821499904, + "loss": 1.317, + "grad_norm": 1.071610927581787, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821588992, + "loss": 1.3914, + "grad_norm": 1.2270925045013428, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8215168, + "loss": 1.3692, + "grad_norm": 1.4479587078094482, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821526016, + "loss": 1.3716, + "grad_norm": 1.3169020414352417, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82157824, + "loss": 1.3992, + "grad_norm": 1.2635364532470703, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82157824, + "loss": 1.2899, + "grad_norm": 1.373231053352356, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821615104, + "loss": 1.422, + "grad_norm": 1.4053988456726074, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82153216, + "loss": 1.3407, + "grad_norm": 0.8282471895217896, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82160128, + "loss": 1.3706, + "grad_norm": 0.7190238833427429, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821542912, + "loss": 1.3459, + "grad_norm": 0.7860512137413025, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821549056, + "loss": 1.3488, + "grad_norm": 1.1901220083236694, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821459968, + "loss": 1.4079, + "grad_norm": 1.0684932470321655, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82157824, + "loss": 1.3284, + "grad_norm": 0.7722901701927185, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 1.3612, + "grad_norm": 0.8951602578163147, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821476864, + "loss": 1.3775, + "grad_norm": 1.1325756311416626, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821526016, + "loss": 1.325, + "grad_norm": 0.942642331123352, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821651968, + "loss": 1.433, + "grad_norm": 1.4455312490463257, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821450752, + "loss": 1.3552, + "grad_norm": 0.9797115921974182, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821515264, + "loss": 1.3862, + "grad_norm": 0.8878766894340515, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 1.3621, + "grad_norm": 0.8418009281158447, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821521408, + "loss": 1.3483, + "grad_norm": 0.8451316356658936, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821518336, + "loss": 1.2696, + "grad_norm": 1.170657992362976, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82149376, + "loss": 1.3504, + "grad_norm": 1.2538940906524658, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821476864, + "loss": 1.3664, + "grad_norm": 1.6823302507400513, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8214784, + "loss": 1.2637, + "grad_norm": 1.147717833518982, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 1.3341, + "grad_norm": 1.516517162322998, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821596672, + "loss": 1.2912, + "grad_norm": 2.154348373413086, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821550592, + "loss": 1.2844, + "grad_norm": 1.7716398239135742, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82160128, + "loss": 1.3746, + "grad_norm": 1.260575771331787, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821572096, + "loss": 1.3232, + "grad_norm": 1.2118550539016724, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821506048, + "loss": 1.3599, + "grad_norm": 1.393874168395996, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82146304, + "loss": 1.3158, + "grad_norm": 1.3296869993209839, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821490688, + "loss": 1.2019, + "grad_norm": 1.4633570909500122, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821473792, + "loss": 1.2939, + "grad_norm": 1.6732406616210938, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.907269632, + "gpu_mem": 4.8215168, + "loss": 1.2322, + "grad_norm": 1.6740715503692627, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82157056, + "loss": 1.3154, + "grad_norm": 1.8534026145935059, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821467648, + "loss": 1.3205, + "grad_norm": 1.7085485458374023, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821527552, + "loss": 1.2231, + "grad_norm": 1.6455297470092773, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821450752, + "loss": 1.3686, + "grad_norm": 2.124823808670044, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821610496, + "loss": 1.2377, + "grad_norm": 1.5744125843048096, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821464576, + "loss": 1.252, + "grad_norm": 1.4778114557266235, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821487616, + "loss": 1.2767, + "grad_norm": 1.8218437433242798, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821510656, + "loss": 1.3831, + "grad_norm": 2.3935351371765137, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821734912, + "loss": 1.382, + "grad_norm": 1.897986650466919, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821492224, + "loss": 1.3874, + "grad_norm": 1.6060776710510254, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821615104, + "loss": 1.2883, + "grad_norm": 1.5025074481964111, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821530624, + "loss": 1.3705, + "grad_norm": 1.5119719505310059, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821619712, + "loss": 1.424, + "grad_norm": 2.0617623329162598, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821518336, + "loss": 1.3578, + "grad_norm": 1.5505425930023193, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821526016, + "loss": 1.3223, + "grad_norm": 0.9542545080184937, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821449216, + "loss": 1.3313, + "grad_norm": 1.0151793956756592, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821529088, + "loss": 1.3219, + "grad_norm": 0.91681307554245, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821499904, + "loss": 1.3673, + "grad_norm": 1.2002599239349365, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821487616, + "loss": 1.3236, + "grad_norm": 1.0015177726745605, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821630464, + "loss": 1.2369, + "grad_norm": 0.840761661529541, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821513728, + "loss": 1.2858, + "grad_norm": 0.930248498916626, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82152448, + "loss": 1.3397, + "grad_norm": 1.08246648311615, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821679616, + "loss": 1.321, + "grad_norm": 1.0461398363113403, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821489152, + "loss": 1.271, + "grad_norm": 1.3000860214233398, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821510656, + "loss": 1.2346, + "grad_norm": 1.1982030868530273, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.907269632, + "gpu_mem": 4.82154752, + "loss": 1.2596, + "grad_norm": 1.1921290159225464, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.907269632, + "gpu_mem": 4.821134336, + "loss": 1.772, + "grad_norm": 2.747661828994751, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720543744, + "loss": 1.1745, + "grad_norm": 2.146927833557129, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72052224, + "loss": 1.2352, + "grad_norm": 2.2698745727539062, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7205376, + "loss": 1.2625, + "grad_norm": 2.372821569442749, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720685056, + "loss": 1.0771, + "grad_norm": 2.0655136108398438, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720631296, + "loss": 1.1858, + "grad_norm": 2.823200225830078, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720648192, + "loss": 1.14, + "grad_norm": 3.027404546737671, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720600576, + "loss": 1.2272, + "grad_norm": 3.397188425064087, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720625152, + "loss": 1.1409, + "grad_norm": 2.5992941856384277, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720672768, + "loss": 1.2013, + "grad_norm": 2.3347225189208984, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720582144, + "loss": 1.203, + "grad_norm": 2.1089553833007812, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720554496, + "loss": 1.052, + "grad_norm": 1.961490273475647, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720643584, + "loss": 1.1544, + "grad_norm": 1.947253942489624, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720534528, + "loss": 1.0507, + "grad_norm": 1.8350939750671387, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720586752, + "loss": 1.1711, + "grad_norm": 2.401553153991699, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720540672, + "loss": 1.0855, + "grad_norm": 2.5688185691833496, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720743424, + "loss": 1.1423, + "grad_norm": 2.0632102489471436, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720741888, + "loss": 1.1998, + "grad_norm": 2.516848564147949, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720665088, + "loss": 1.0053, + "grad_norm": 1.9849578142166138, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72062208, + "loss": 1.205, + "grad_norm": 2.975541591644287, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72059904, + "loss": 1.0976, + "grad_norm": 2.543401002883911, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72055296, + "loss": 1.0961, + "grad_norm": 1.9914641380310059, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720669696, + "loss": 1.2864, + "grad_norm": 3.0595102310180664, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720557568, + "loss": 1.2244, + "grad_norm": 2.315214157104492, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720546816, + "loss": 1.2131, + "grad_norm": 2.4656543731689453, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720551424, + "loss": 1.1849, + "grad_norm": 2.087986707687378, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720666624, + "loss": 1.2312, + "grad_norm": 2.562169313430786, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720640512, + "loss": 1.1654, + "grad_norm": 2.3219666481018066, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720542208, + "loss": 1.0097, + "grad_norm": 2.26962947845459, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720577536, + "loss": 1.0652, + "grad_norm": 2.029843807220459, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720543744, + "loss": 1.1801, + "grad_norm": 2.537774085998535, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720677376, + "loss": 1.126, + "grad_norm": 2.247429370880127, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720651264, + "loss": 1.0666, + "grad_norm": 2.3856699466705322, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720695808, + "loss": 0.9136, + "grad_norm": 2.3156299591064453, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720586752, + "loss": 1.2307, + "grad_norm": 2.2851755619049072, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720603648, + "loss": 1.0386, + "grad_norm": 2.2540788650512695, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720678912, + "loss": 1.2057, + "grad_norm": 2.9888339042663574, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720526848, + "loss": 1.3163, + "grad_norm": 2.999528169631958, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72064512, + "loss": 1.0542, + "grad_norm": 2.929593801498413, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720634368, + "loss": 1.0382, + "grad_norm": 2.4614131450653076, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720600576, + "loss": 1.1743, + "grad_norm": 2.516267776489258, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720579072, + "loss": 1.3732, + "grad_norm": 3.4944002628326416, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720574464, + "loss": 1.039, + "grad_norm": 2.2334682941436768, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720585216, + "loss": 1.1394, + "grad_norm": 2.056826591491699, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720638976, + "loss": 1.2034, + "grad_norm": 2.6849091053009033, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720556032, + "loss": 1.0717, + "grad_norm": 2.123591899871826, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72058368, + "loss": 1.0976, + "grad_norm": 2.0933353900909424, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720582144, + "loss": 1.0372, + "grad_norm": 1.989935278892517, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720625152, + "loss": 1.0978, + "grad_norm": 2.270538806915283, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720623616, + "loss": 1.1899, + "grad_norm": 2.117157220840454, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720674304, + "loss": 1.0497, + "grad_norm": 2.194533109664917, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720680448, + "loss": 1.0948, + "grad_norm": 2.56341552734375, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720619008, + "loss": 1.1848, + "grad_norm": 2.4153096675872803, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720608256, + "loss": 1.12, + "grad_norm": 2.6150221824645996, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720632832, + "loss": 1.1436, + "grad_norm": 2.3389010429382324, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720649728, + "loss": 1.0509, + "grad_norm": 2.5509917736053467, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720520704, + "loss": 1.1048, + "grad_norm": 2.68502140045166, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720695808, + "loss": 1.049, + "grad_norm": 2.708735704421997, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.907269632, + "gpu_mem": 4.7206528, + "loss": 1.03, + "grad_norm": 2.858384847640991, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720562176, + "loss": 1.028, + "grad_norm": 2.592632293701172, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720509952, + "loss": 0.9465, + "grad_norm": 2.679378032684326, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720525312, + "loss": 1.1384, + "grad_norm": 3.0502395629882812, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720626688, + "loss": 1.1556, + "grad_norm": 3.1190407276153564, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720635904, + "loss": 1.1488, + "grad_norm": 2.9984912872314453, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720665088, + "loss": 1.2274, + "grad_norm": 3.7603578567504883, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720726528, + "loss": 1.1054, + "grad_norm": 3.4020636081695557, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720580608, + "loss": 1.0454, + "grad_norm": 2.9773528575897217, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72066816, + "loss": 1.1908, + "grad_norm": 4.301827907562256, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720571392, + "loss": 1.0115, + "grad_norm": 2.5554540157318115, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720642048, + "loss": 1.164, + "grad_norm": 3.174329996109009, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720677376, + "loss": 1.0187, + "grad_norm": 2.6852877140045166, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720615936, + "loss": 1.0174, + "grad_norm": 2.44645619392395, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720572928, + "loss": 1.0777, + "grad_norm": 2.9309890270233154, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720574464, + "loss": 1.1412, + "grad_norm": 2.663360118865967, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720611328, + "loss": 0.9798, + "grad_norm": 2.3876092433929443, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72058368, + "loss": 1.1671, + "grad_norm": 2.889526605606079, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720640512, + "loss": 1.0021, + "grad_norm": 3.0505428314208984, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720669696, + "loss": 1.0583, + "grad_norm": 3.1210105419158936, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720569856, + "loss": 1.1881, + "grad_norm": 3.1920559406280518, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720592896, + "loss": 1.1453, + "grad_norm": 2.9236438274383545, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72056832, + "loss": 1.0346, + "grad_norm": 2.4266490936279297, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72066048, + "loss": 1.1699, + "grad_norm": 2.313926935195923, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.907269632, + "gpu_mem": 4.72052992, + "loss": 1.1096, + "grad_norm": 2.576312780380249, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720666624, + "loss": 1.0939, + "grad_norm": 2.4065840244293213, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.907269632, + "gpu_mem": 4.720523776, + "loss": 0.9534, + "grad_norm": 2.316574811935425, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720546816, + "loss": 1.0989, + "grad_norm": 3.0087523460388184, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720594432, + "loss": 1.1539, + "grad_norm": 3.051593065261841, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.90943232, + "gpu_mem": 4.72063744, + "loss": 1.1314, + "grad_norm": 2.7285192012786865, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720565248, + "loss": 1.1022, + "grad_norm": 2.449920415878296, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720663552, + "loss": 1.0397, + "grad_norm": 2.4486451148986816, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720559104, + "loss": 1.1387, + "grad_norm": 3.2360188961029053, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.90943232, + "gpu_mem": 4.72071424, + "loss": 0.9133, + "grad_norm": 2.3694214820861816, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720546816, + "loss": 1.0851, + "grad_norm": 2.412914991378784, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720589824, + "loss": 1.0065, + "grad_norm": 2.6868178844451904, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720611328, + "loss": 1.1289, + "grad_norm": 2.902972936630249, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720534528, + "loss": 1.1227, + "grad_norm": 3.1373236179351807, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720671232, + "loss": 1.0606, + "grad_norm": 3.2857189178466797, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720559104, + "loss": 0.9852, + "grad_norm": 2.3932127952575684, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720556032, + "loss": 1.1338, + "grad_norm": 3.039724588394165, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.90943232, + "gpu_mem": 4.72052992, + "loss": 1.2475, + "grad_norm": 3.1189301013946533, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720565248, + "loss": 0.992, + "grad_norm": 3.2797043323516846, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720580608, + "loss": 0.9503, + "grad_norm": 2.535738945007324, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720574464, + "loss": 0.9185, + "grad_norm": 2.940528154373169, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720586752, + "loss": 1.203, + "grad_norm": 3.1664814949035645, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720619008, + "loss": 1.0307, + "grad_norm": 3.035275936126709, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720620544, + "loss": 1.1144, + "grad_norm": 2.6826674938201904, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720592896, + "loss": 1.1587, + "grad_norm": 2.865182876586914, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720642048, + "loss": 1.0236, + "grad_norm": 2.445111036300659, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720571392, + "loss": 1.1079, + "grad_norm": 2.9539287090301514, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.90943232, + "gpu_mem": 4.72062976, + "loss": 0.9718, + "grad_norm": 2.8627753257751465, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.90943232, + "gpu_mem": 4.720620544, + "loss": 0.9025, + "grad_norm": 2.4897494316101074, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.909628928, + "gpu_mem": 4.720554496, + "loss": 1.117, + "grad_norm": 3.6558778285980225, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720576, + "loss": 1.054, + "grad_norm": 2.9850881099700928, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720631296, + "loss": 1.067, + "grad_norm": 3.151782274246216, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720548352, + "loss": 1.1331, + "grad_norm": 3.0441250801086426, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720612864, + "loss": 1.1899, + "grad_norm": 3.106004238128662, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720571392, + "loss": 0.9646, + "grad_norm": 2.9565675258636475, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720603648, + "loss": 1.0337, + "grad_norm": 2.6489667892456055, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720577536, + "loss": 0.9853, + "grad_norm": 2.705331325531006, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720571392, + "loss": 1.1351, + "grad_norm": 3.1538424491882324, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720582144, + "loss": 1.0353, + "grad_norm": 2.906825304031372, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720486912, + "loss": 1.1422, + "grad_norm": 2.968611240386963, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720556032, + "loss": 1.0448, + "grad_norm": 2.7935004234313965, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720603648, + "loss": 1.172, + "grad_norm": 3.2870078086853027, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720548352, + "loss": 1.1895, + "grad_norm": 3.179579496383667, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72060672, + "loss": 1.1247, + "grad_norm": 3.0445284843444824, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720612864, + "loss": 1.0947, + "grad_norm": 3.279428243637085, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720749568, + "loss": 0.8078, + "grad_norm": 2.362947940826416, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720559104, + "loss": 1.0319, + "grad_norm": 2.822824478149414, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720623616, + "loss": 1.0548, + "grad_norm": 2.7411043643951416, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720539136, + "loss": 0.9765, + "grad_norm": 2.4286351203918457, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720556032, + "loss": 1.1083, + "grad_norm": 3.19645357131958, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720556032, + "loss": 1.1299, + "grad_norm": 3.2862765789031982, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720681984, + "loss": 1.073, + "grad_norm": 2.671074390411377, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720562176, + "loss": 0.8474, + "grad_norm": 2.467865228652954, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720720384, + "loss": 1.0546, + "grad_norm": 3.0531675815582275, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72054528, + "loss": 1.0249, + "grad_norm": 2.7074482440948486, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720563712, + "loss": 1.0864, + "grad_norm": 3.223503589630127, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720740352, + "loss": 0.9918, + "grad_norm": 2.672673463821411, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.911791616, + "gpu_mem": 4.7206144, + "loss": 0.9259, + "grad_norm": 3.450578451156616, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720611328, + "loss": 1.1264, + "grad_norm": 3.2777979373931885, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720620544, + "loss": 1.0597, + "grad_norm": 2.9211320877075195, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720496128, + "loss": 1.0947, + "grad_norm": 3.0480191707611084, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720778752, + "loss": 1.115, + "grad_norm": 3.162631034851074, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720542208, + "loss": 1.0038, + "grad_norm": 3.0641705989837646, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72063744, + "loss": 1.1432, + "grad_norm": 2.8350038528442383, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720603648, + "loss": 1.1061, + "grad_norm": 3.2098021507263184, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720571392, + "loss": 1.0847, + "grad_norm": 3.05271053314209, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720589824, + "loss": 1.1392, + "grad_norm": 2.8073513507843018, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720640512, + "loss": 1.0731, + "grad_norm": 2.6490492820739746, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720536064, + "loss": 1.2001, + "grad_norm": 3.3745298385620117, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720565248, + "loss": 0.9935, + "grad_norm": 3.0233676433563232, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720508416, + "loss": 1.0634, + "grad_norm": 2.8439714908599854, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72058368, + "loss": 0.8548, + "grad_norm": 2.584019184112549, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720576, + "loss": 0.9653, + "grad_norm": 2.916884183883667, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72060672, + "loss": 1.0419, + "grad_norm": 3.438354015350342, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720536064, + "loss": 1.1702, + "grad_norm": 3.156700372695923, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720562176, + "loss": 0.9672, + "grad_norm": 3.563671350479126, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720657408, + "loss": 0.9972, + "grad_norm": 2.6328306198120117, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72055296, + "loss": 0.9054, + "grad_norm": 2.7762458324432373, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720562176, + "loss": 0.9543, + "grad_norm": 2.6288981437683105, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720589824, + "loss": 1.1611, + "grad_norm": 3.037616014480591, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720603648, + "loss": 1.1343, + "grad_norm": 3.5050625801086426, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720597504, + "loss": 0.9032, + "grad_norm": 2.549309492111206, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72056064, + "loss": 1.0268, + "grad_norm": 2.863330602645874, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720619008, + "loss": 1.1196, + "grad_norm": 3.1663286685943604, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720528384, + "loss": 1.1185, + "grad_norm": 3.354562520980835, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720612864, + "loss": 1.1204, + "grad_norm": 3.093818426132202, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720651264, + "loss": 0.9876, + "grad_norm": 3.430046558380127, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720695808, + "loss": 1.148, + "grad_norm": 2.708543062210083, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720657408, + "loss": 0.9714, + "grad_norm": 2.763545513153076, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720594432, + "loss": 1.0749, + "grad_norm": 3.263587236404419, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720648192, + "loss": 1.1635, + "grad_norm": 3.4809043407440186, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720586752, + "loss": 1.0429, + "grad_norm": 3.1237943172454834, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720572928, + "loss": 0.9442, + "grad_norm": 3.4066269397735596, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720784896, + "loss": 0.8601, + "grad_norm": 2.930818557739258, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720649728, + "loss": 1.0289, + "grad_norm": 2.8005332946777344, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720623616, + "loss": 0.8598, + "grad_norm": 2.6058082580566406, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72062976, + "loss": 1.0542, + "grad_norm": 2.960752487182617, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720663552, + "loss": 1.1329, + "grad_norm": 3.186211109161377, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720571392, + "loss": 1.13, + "grad_norm": 3.221698522567749, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720655872, + "loss": 0.9817, + "grad_norm": 3.2241458892822266, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720543744, + "loss": 1.1201, + "grad_norm": 3.0688910484313965, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72060672, + "loss": 1.0682, + "grad_norm": 2.6368632316589355, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720571392, + "loss": 0.8909, + "grad_norm": 2.5905256271362305, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720571392, + "loss": 1.0576, + "grad_norm": 3.0792500972747803, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720655872, + "loss": 1.1574, + "grad_norm": 2.8745594024658203, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720602112, + "loss": 1.158, + "grad_norm": 2.85233736038208, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72062208, + "loss": 1.2037, + "grad_norm": 3.6716747283935547, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720549888, + "loss": 1.0431, + "grad_norm": 2.8210411071777344, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72063744, + "loss": 1.1194, + "grad_norm": 3.366770029067993, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720605184, + "loss": 1.0495, + "grad_norm": 2.788761615753174, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.911791616, + "gpu_mem": 4.72058368, + "loss": 0.8639, + "grad_norm": 2.6695220470428467, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720526848, + "loss": 1.078, + "grad_norm": 3.037633180618286, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720592896, + "loss": 1.2392, + "grad_norm": 3.3663833141326904, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720634368, + "loss": 1.06, + "grad_norm": 2.6454248428344727, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.911791616, + "gpu_mem": 4.720634368, + "train_runtime": 8634.0531, + "train_samples_per_second": 4.367, + "train_steps_per_second": 0.068, + "total_flos": 9.037250271924634e+16, + "train_loss": 1.3105507745426528 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..616e0cc3677d4646846654f1887fbef4d57d10ca --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca302c196787c2096b76e81f8f725652d4c225f --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.3652137988019004 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..30863782d4139b93fa4abfa0fabca9173d72e2b9 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-logiqa-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2", + "seed": 42, + "timestamp": "2025-08-29T22:20:15.066832" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..4b35d28c30981306f1fb9ed38b1271faab17f009 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-logiqa-r8-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 1.754959872, + "gpu_mem": 4.443033088, + "loss": 3.8396, + "grad_norm": 29.9427547454834, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 1.759875072, + "gpu_mem": 4.493515264, + "loss": 3.9728, + "grad_norm": 29.75017738342285, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 1.759875072, + "gpu_mem": 4.493592064, + "loss": 3.7768, + "grad_norm": 28.762060165405273, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493490688, + "loss": 3.5857, + "grad_norm": 29.85860824584961, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493506048, + "loss": 3.4873, + "grad_norm": 26.36086082458496, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493498368, + "loss": 3.1999, + "grad_norm": 22.19527244567871, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493569024, + "loss": 2.8994, + "grad_norm": 19.02910041809082, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493553664, + "loss": 2.5203, + "grad_norm": 14.694186210632324, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493552128, + "loss": 2.2408, + "grad_norm": 11.600865364074707, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49356288, + "loss": 2.2051, + "grad_norm": 9.958389282226562, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493466112, + "loss": 1.8214, + "grad_norm": 6.800085067749023, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 1.76007168, + "gpu_mem": 4.4935168, + "loss": 1.6257, + "grad_norm": 3.988950729370117, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49360896, + "loss": 1.5324, + "grad_norm": 2.620539426803589, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493521408, + "loss": 1.4579, + "grad_norm": 1.4614462852478027, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493659648, + "loss": 1.4586, + "grad_norm": 2.255279302597046, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493519872, + "loss": 1.4784, + "grad_norm": 2.894843339920044, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493550592, + "loss": 1.4566, + "grad_norm": 2.7120351791381836, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493513728, + "loss": 1.371, + "grad_norm": 1.0821328163146973, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493421568, + "loss": 1.3897, + "grad_norm": 0.858384370803833, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493459968, + "loss": 1.4009, + "grad_norm": 1.6342703104019165, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 1.76007168, + "gpu_mem": 4.4935936, + "loss": 1.4166, + "grad_norm": 2.145219326019287, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493492224, + "loss": 1.4654, + "grad_norm": 2.617515802383423, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493519872, + "loss": 1.4312, + "grad_norm": 3.3458406925201416, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493513728, + "loss": 1.3764, + "grad_norm": 1.3073737621307373, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493519872, + "loss": 1.4659, + "grad_norm": 3.0960774421691895, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493572096, + "loss": 1.4729, + "grad_norm": 4.517819881439209, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493513728, + "loss": 1.4154, + "grad_norm": 1.7504642009735107, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493459968, + "loss": 1.4156, + "grad_norm": 1.200673222541809, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493552128, + "loss": 1.3988, + "grad_norm": 1.0041426420211792, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49354752, + "loss": 1.3574, + "grad_norm": 1.9802424907684326, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493526016, + "loss": 1.3981, + "grad_norm": 1.6931202411651611, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493530624, + "loss": 1.4771, + "grad_norm": 2.234528064727783, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493565952, + "loss": 1.3581, + "grad_norm": 1.1100895404815674, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493490688, + "loss": 1.5149, + "grad_norm": 2.700509548187256, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493538304, + "loss": 1.4975, + "grad_norm": 2.721860885620117, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493553664, + "loss": 1.4425, + "grad_norm": 2.449096441268921, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493619712, + "loss": 1.3685, + "grad_norm": 1.4969744682312012, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493502976, + "loss": 1.4079, + "grad_norm": 1.031211495399475, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493619712, + "loss": 1.4343, + "grad_norm": 0.9084482789039612, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493541376, + "loss": 1.4122, + "grad_norm": 1.3888663053512573, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49344, + "loss": 1.4265, + "grad_norm": 1.2090824842453003, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493512192, + "loss": 1.4203, + "grad_norm": 0.9896847009658813, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493475328, + "loss": 1.4138, + "grad_norm": 1.1853652000427246, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493510656, + "loss": 1.3921, + "grad_norm": 0.49625444412231445, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493564416, + "loss": 1.3948, + "grad_norm": 0.3971928656101227, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493612032, + "loss": 1.3785, + "grad_norm": 0.6543405652046204, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49344, + "loss": 1.4003, + "grad_norm": 0.6776618957519531, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49349376, + "loss": 1.4108, + "grad_norm": 0.9406586289405823, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493483008, + "loss": 1.377, + "grad_norm": 0.4977302849292755, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493492224, + "loss": 1.3761, + "grad_norm": 0.6218267679214478, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493587456, + "loss": 1.3667, + "grad_norm": 0.5640150308609009, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493529088, + "loss": 1.494, + "grad_norm": 2.55256986618042, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493607424, + "loss": 1.4582, + "grad_norm": 1.8457847833633423, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493507584, + "loss": 1.3958, + "grad_norm": 0.5945470929145813, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493504512, + "loss": 1.4029, + "grad_norm": 0.9171520471572876, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493549056, + "loss": 1.3941, + "grad_norm": 0.8504194021224976, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493541376, + "loss": 1.4046, + "grad_norm": 0.8308277130126953, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493564416, + "loss": 1.4197, + "grad_norm": 0.9561573266983032, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49353216, + "loss": 1.3538, + "grad_norm": 0.9574133157730103, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493522944, + "loss": 1.4272, + "grad_norm": 0.8894922733306885, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493559808, + "loss": 1.3742, + "grad_norm": 0.562622606754303, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493599744, + "loss": 1.3836, + "grad_norm": 0.5316916704177856, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493529088, + "loss": 1.3692, + "grad_norm": 0.3404761552810669, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493430784, + "loss": 1.3824, + "grad_norm": 0.9187424778938293, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493512192, + "loss": 1.447, + "grad_norm": 1.415601134300232, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49361664, + "loss": 1.4179, + "grad_norm": 0.7082955241203308, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49349376, + "loss": 1.4568, + "grad_norm": 1.3742142915725708, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493545984, + "loss": 1.4102, + "grad_norm": 0.7001263499259949, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493538304, + "loss": 1.3856, + "grad_norm": 0.5632603168487549, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493453824, + "loss": 1.3876, + "grad_norm": 0.3205040991306305, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493476864, + "loss": 1.3924, + "grad_norm": 0.9779516458511353, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493526016, + "loss": 1.4156, + "grad_norm": 1.042282223701477, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493479936, + "loss": 1.4131, + "grad_norm": 0.7601412534713745, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493533696, + "loss": 1.4015, + "grad_norm": 0.9169175028800964, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493576704, + "loss": 1.4024, + "grad_norm": 1.4626282453536987, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49352448, + "loss": 1.3984, + "grad_norm": 0.995038628578186, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49346304, + "loss": 1.4165, + "grad_norm": 0.8059393763542175, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493564416, + "loss": 1.3803, + "grad_norm": 1.3926606178283691, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493581312, + "loss": 1.364, + "grad_norm": 1.5349339246749878, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493421568, + "loss": 1.4521, + "grad_norm": 1.832576870918274, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493527552, + "loss": 1.452, + "grad_norm": 2.0124080181121826, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493499904, + "loss": 1.4318, + "grad_norm": 1.7343522310256958, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493513728, + "loss": 1.4516, + "grad_norm": 2.1409637928009033, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493519872, + "loss": 1.4127, + "grad_norm": 1.3911513090133667, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493549056, + "loss": 1.3822, + "grad_norm": 0.4287321865558624, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493490688, + "loss": 1.3918, + "grad_norm": 1.4672143459320068, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493512192, + "loss": 1.4201, + "grad_norm": 1.4591654539108276, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493538304, + "loss": 1.3694, + "grad_norm": 1.3218352794647217, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49348608, + "loss": 1.4089, + "grad_norm": 1.3128831386566162, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493518336, + "loss": 1.3992, + "grad_norm": 1.2594449520111084, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493510656, + "loss": 1.3593, + "grad_norm": 0.8974713087081909, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493450752, + "loss": 1.4775, + "grad_norm": 1.9394911527633667, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493651968, + "loss": 1.4501, + "grad_norm": 1.3481810092926025, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493466112, + "loss": 1.4278, + "grad_norm": 1.5071247816085815, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493633536, + "loss": 1.3856, + "grad_norm": 0.7479880452156067, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493507584, + "loss": 1.3952, + "grad_norm": 0.9293127059936523, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493502976, + "loss": 1.4083, + "grad_norm": 0.9140604734420776, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493550592, + "loss": 1.3833, + "grad_norm": 0.916469156742096, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49353216, + "loss": 1.412, + "grad_norm": 1.7365604639053345, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493496832, + "loss": 1.363, + "grad_norm": 0.5468966364860535, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493479936, + "loss": 1.4183, + "grad_norm": 0.5792218446731567, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493530624, + "loss": 1.4524, + "grad_norm": 0.9310188293457031, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493490688, + "loss": 1.4435, + "grad_norm": 1.1525992155075073, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493579776, + "loss": 1.4095, + "grad_norm": 0.5910022258758545, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493449216, + "loss": 1.3752, + "grad_norm": 0.6407392621040344, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493512192, + "loss": 1.3993, + "grad_norm": 0.5102116465568542, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493507584, + "loss": 1.378, + "grad_norm": 0.3125024437904358, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493496832, + "loss": 1.3859, + "grad_norm": 0.6352884769439697, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493665792, + "loss": 1.3879, + "grad_norm": 0.2740617096424103, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49346304, + "loss": 1.3693, + "grad_norm": 0.738030195236206, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493464576, + "loss": 1.3936, + "grad_norm": 0.6761789917945862, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493564416, + "loss": 1.3835, + "grad_norm": 0.47887349128723145, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49366272, + "loss": 1.3898, + "grad_norm": 0.7697239518165588, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493492224, + "loss": 1.4089, + "grad_norm": 1.0528883934020996, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49350144, + "loss": 1.4141, + "grad_norm": 0.8010849952697754, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49356288, + "loss": 1.3829, + "grad_norm": 0.6602729558944702, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49345536, + "loss": 1.3641, + "grad_norm": 0.3255758285522461, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493550592, + "loss": 1.3858, + "grad_norm": 0.3694641590118408, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493684224, + "loss": 1.3655, + "grad_norm": 0.4891485869884491, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493587456, + "loss": 1.4752, + "grad_norm": 2.020766496658325, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493633536, + "loss": 1.4054, + "grad_norm": 1.045362114906311, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49357824, + "loss": 1.3856, + "grad_norm": 0.5713481903076172, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49361664, + "loss": 1.39, + "grad_norm": 1.0584194660186768, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49353984, + "loss": 1.3825, + "grad_norm": 0.7097488641738892, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493575168, + "loss": 1.4022, + "grad_norm": 0.7776065468788147, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493481472, + "loss": 1.3509, + "grad_norm": 0.3058049976825714, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493506048, + "loss": 1.4268, + "grad_norm": 1.1322132349014282, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 1.76007168, + "gpu_mem": 4.4934784, + "loss": 1.3748, + "grad_norm": 0.5986920595169067, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493484544, + "loss": 1.3877, + "grad_norm": 0.6120620369911194, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493475328, + "loss": 1.4469, + "grad_norm": 1.167204737663269, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49340928, + "loss": 1.3982, + "grad_norm": 0.6184087991714478, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493572096, + "loss": 1.401, + "grad_norm": 0.5463101267814636, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493484544, + "loss": 1.4145, + "grad_norm": 0.9058083295822144, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493487616, + "loss": 1.396, + "grad_norm": 0.3178761601448059, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493542912, + "loss": 1.3972, + "grad_norm": 0.6816291213035583, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493446144, + "loss": 1.4046, + "grad_norm": 0.53705233335495, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493613568, + "loss": 1.3826, + "grad_norm": 0.4543381333351135, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493479936, + "loss": 1.3744, + "grad_norm": 0.8149245977401733, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493469184, + "loss": 1.3969, + "grad_norm": 1.2226688861846924, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493521408, + "loss": 1.3909, + "grad_norm": 0.952864944934845, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493446144, + "loss": 1.4044, + "grad_norm": 0.8851274251937866, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493507584, + "loss": 1.4318, + "grad_norm": 1.0183138847351074, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493506048, + "loss": 1.4029, + "grad_norm": 0.8148736953735352, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493450752, + "loss": 1.3604, + "grad_norm": 0.3051430284976959, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49344768, + "loss": 1.4384, + "grad_norm": 1.1082253456115723, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493549056, + "loss": 1.3739, + "grad_norm": 0.5351439714431763, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493553664, + "loss": 1.3853, + "grad_norm": 0.6897121071815491, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493651968, + "loss": 1.3873, + "grad_norm": 0.5331100225448608, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493495296, + "loss": 1.3806, + "grad_norm": 0.5181450843811035, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493487616, + "loss": 1.4027, + "grad_norm": 0.626332700252533, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493430784, + "loss": 1.3803, + "grad_norm": 0.336386114358902, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49346304, + "loss": 1.3756, + "grad_norm": 0.33393341302871704, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493490688, + "loss": 1.3937, + "grad_norm": 0.6458150148391724, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493541376, + "loss": 1.3665, + "grad_norm": 0.4425663948059082, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493453824, + "loss": 1.4036, + "grad_norm": 0.3146791160106659, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493473792, + "loss": 1.3961, + "grad_norm": 0.770833432674408, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493550592, + "loss": 1.4051, + "grad_norm": 0.7802287340164185, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493452288, + "loss": 1.3951, + "grad_norm": 0.38587960600852966, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493553664, + "loss": 1.3953, + "grad_norm": 0.48054128885269165, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493510656, + "loss": 1.388, + "grad_norm": 0.6814455986022949, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49350912, + "loss": 1.3669, + "grad_norm": 0.39116182923316956, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493492224, + "loss": 1.3799, + "grad_norm": 0.3059747517108917, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49360896, + "loss": 1.4293, + "grad_norm": 0.7665922045707703, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493504512, + "loss": 1.385, + "grad_norm": 0.6291453242301941, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493584384, + "loss": 1.3869, + "grad_norm": 0.46691203117370605, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49349376, + "loss": 1.3712, + "grad_norm": 0.602537214756012, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49345536, + "loss": 1.3731, + "grad_norm": 0.6017073392868042, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493458432, + "loss": 1.3705, + "grad_norm": 0.5499193668365479, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493526016, + "loss": 1.3681, + "grad_norm": 0.4619956314563751, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493533696, + "loss": 1.4073, + "grad_norm": 0.3380984365940094, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493507584, + "loss": 1.4004, + "grad_norm": 0.42955282330513, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493487616, + "loss": 1.3864, + "grad_norm": 0.26765188574790955, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49352448, + "loss": 1.3907, + "grad_norm": 0.5379332900047302, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493467648, + "loss": 1.3929, + "grad_norm": 0.3578237295150757, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493538304, + "loss": 1.3717, + "grad_norm": 0.4717216491699219, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493435392, + "loss": 1.3632, + "grad_norm": 0.3030129671096802, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493444608, + "loss": 1.4021, + "grad_norm": 0.7375836372375488, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493521408, + "loss": 1.3878, + "grad_norm": 0.5706522464752197, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493529088, + "loss": 1.3832, + "grad_norm": 0.5511612296104431, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493535232, + "loss": 1.3881, + "grad_norm": 0.4406997561454773, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493453824, + "loss": 1.3841, + "grad_norm": 0.4044110178947449, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493564416, + "loss": 1.4024, + "grad_norm": 0.5116137862205505, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493552128, + "loss": 1.3703, + "grad_norm": 0.2789294123649597, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493535232, + "loss": 1.3691, + "grad_norm": 0.690117359161377, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493473792, + "loss": 1.3835, + "grad_norm": 0.8589558005332947, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493506048, + "loss": 1.3533, + "grad_norm": 0.23568114638328552, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493602816, + "loss": 1.3687, + "grad_norm": 0.23876729607582092, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493575168, + "loss": 1.3559, + "grad_norm": 0.3397904336452484, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493479936, + "loss": 1.4622, + "grad_norm": 0.9744318127632141, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493512192, + "loss": 1.3764, + "grad_norm": 0.4829428195953369, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49350144, + "loss": 1.4136, + "grad_norm": 0.5078803300857544, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493438464, + "loss": 1.3992, + "grad_norm": 0.6458097100257874, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493559808, + "loss": 1.3969, + "grad_norm": 0.3370194435119629, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493481472, + "loss": 1.3701, + "grad_norm": 0.4040037989616394, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 1.76007168, + "gpu_mem": 4.4935936, + "loss": 1.362, + "grad_norm": 0.2791256010532379, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493588992, + "loss": 1.385, + "grad_norm": 0.5491617918014526, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 2.0405, + "grad_norm": 0.5287402272224426, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518820352, + "loss": 1.3705, + "grad_norm": 0.38353100419044495, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51881728, + "loss": 1.3925, + "grad_norm": 0.8535876870155334, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51870208, + "loss": 1.344, + "grad_norm": 0.5518146753311157, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518752768, + "loss": 1.3932, + "grad_norm": 0.8776749968528748, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518746624, + "loss": 1.4546, + "grad_norm": 1.1782629489898682, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51879424, + "loss": 1.3617, + "grad_norm": 0.44135817885398865, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518726656, + "loss": 1.3288, + "grad_norm": 0.4756423830986023, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518772736, + "loss": 1.3737, + "grad_norm": 0.4496293365955353, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51878656, + "loss": 1.3361, + "grad_norm": 0.5266224145889282, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518742016, + "loss": 1.3725, + "grad_norm": 1.0613235235214233, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518734336, + "loss": 1.359, + "grad_norm": 0.8276228904724121, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51885568, + "loss": 1.3587, + "grad_norm": 0.9629799127578735, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518752768, + "loss": 1.3859, + "grad_norm": 1.4227005243301392, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 1.3756, + "grad_norm": 1.0312976837158203, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518757376, + "loss": 1.3596, + "grad_norm": 0.6213247179985046, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518683648, + "loss": 1.4449, + "grad_norm": 1.6149866580963135, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51867904, + "loss": 1.3888, + "grad_norm": 1.4206966161727905, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518729728, + "loss": 1.3978, + "grad_norm": 1.7047111988067627, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518858752, + "loss": 1.3616, + "grad_norm": 0.5671579837799072, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518720512, + "loss": 1.3919, + "grad_norm": 0.9414582848548889, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518746624, + "loss": 1.3555, + "grad_norm": 0.5971041321754456, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518758912, + "loss": 1.3772, + "grad_norm": 0.510827898979187, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518731264, + "loss": 1.474, + "grad_norm": 1.7662144899368286, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518705152, + "loss": 1.4178, + "grad_norm": 0.8067710995674133, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51871744, + "loss": 1.3928, + "grad_norm": 0.9047196507453918, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518745088, + "loss": 1.3857, + "grad_norm": 1.1072514057159424, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51876352, + "loss": 1.3896, + "grad_norm": 0.6583268642425537, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518814208, + "loss": 1.3756, + "grad_norm": 0.7056071162223816, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518765056, + "loss": 1.4179, + "grad_norm": 1.2489075660705566, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518715904, + "loss": 1.4224, + "grad_norm": 1.8807587623596191, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518760448, + "loss": 1.4305, + "grad_norm": 1.5133349895477295, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518659072, + "loss": 1.4181, + "grad_norm": 1.1571718454360962, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518726656, + "loss": 1.3953, + "grad_norm": 0.7520480155944824, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51871744, + "loss": 1.3881, + "grad_norm": 1.0168302059173584, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518722048, + "loss": 1.3975, + "grad_norm": 1.7122282981872559, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518837248, + "loss": 1.3852, + "grad_norm": 0.7429341077804565, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518797312, + "loss": 1.416, + "grad_norm": 1.028501033782959, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51875584, + "loss": 1.3962, + "grad_norm": 1.11207914352417, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 1.3943, + "grad_norm": 1.1618434190750122, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51877888, + "loss": 1.3842, + "grad_norm": 0.5817784667015076, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518798848, + "loss": 1.3767, + "grad_norm": 0.6992388963699341, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518715904, + "loss": 1.4242, + "grad_norm": 1.7361243963241577, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518851072, + "loss": 1.392, + "grad_norm": 1.1824274063110352, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518804992, + "loss": 1.3886, + "grad_norm": 0.9333128333091736, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518826496, + "loss": 1.3768, + "grad_norm": 0.6037389636039734, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518712832, + "loss": 1.3407, + "grad_norm": 1.472715973854065, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518700544, + "loss": 1.4997, + "grad_norm": 1.9701818227767944, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518705152, + "loss": 1.4596, + "grad_norm": 1.9132858514785767, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518742016, + "loss": 1.4448, + "grad_norm": 1.5314313173294067, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518703616, + "loss": 1.3451, + "grad_norm": 0.7357609272003174, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518749696, + "loss": 1.4143, + "grad_norm": 0.802513837814331, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518758912, + "loss": 1.3924, + "grad_norm": 0.4126146137714386, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518797312, + "loss": 1.3687, + "grad_norm": 0.5610044598579407, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518715904, + "loss": 1.3934, + "grad_norm": 0.6500912308692932, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51872512, + "loss": 1.3828, + "grad_norm": 0.8447783589363098, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518791168, + "loss": 1.3857, + "grad_norm": 0.5341652631759644, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51871744, + "loss": 1.3565, + "grad_norm": 0.2953730523586273, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518711296, + "loss": 1.3811, + "grad_norm": 0.6444727778434753, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518777344, + "loss": 1.3938, + "grad_norm": 0.8146637082099915, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518735872, + "loss": 1.3658, + "grad_norm": 0.24652402102947235, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518849536, + "loss": 1.3779, + "grad_norm": 0.6610425710678101, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518726656, + "loss": 1.3849, + "grad_norm": 0.5602418184280396, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518752768, + "loss": 1.399, + "grad_norm": 0.42608270049095154, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518758912, + "loss": 1.3752, + "grad_norm": 0.8030539155006409, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518826496, + "loss": 1.3739, + "grad_norm": 0.3628354072570801, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518682112, + "loss": 1.3787, + "grad_norm": 0.7377053499221802, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518815744, + "loss": 1.3764, + "grad_norm": 0.4202861189842224, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518844928, + "loss": 1.3634, + "grad_norm": 0.6214026212692261, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518714368, + "loss": 1.391, + "grad_norm": 0.9048495292663574, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518715904, + "loss": 1.415, + "grad_norm": 0.7518694400787354, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51871744, + "loss": 1.3672, + "grad_norm": 0.413564532995224, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518791168, + "loss": 1.4094, + "grad_norm": 1.0058109760284424, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518772736, + "loss": 1.3332, + "grad_norm": 0.28967103362083435, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518715904, + "loss": 1.4231, + "grad_norm": 0.8552537560462952, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 1.3588, + "grad_norm": 0.6700282692909241, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518821888, + "loss": 1.3593, + "grad_norm": 0.4073947072029114, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518774272, + "loss": 1.3836, + "grad_norm": 0.5341019034385681, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51890176, + "loss": 1.3933, + "grad_norm": 0.7651644945144653, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51878656, + "loss": 1.359, + "grad_norm": 0.5330560207366943, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518735872, + "loss": 1.3444, + "grad_norm": 0.40881603956222534, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518723584, + "loss": 1.33, + "grad_norm": 0.6552839875221252, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518715904, + "loss": 1.3925, + "grad_norm": 0.6213047504425049, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518838784, + "loss": 1.3714, + "grad_norm": 0.829371988773346, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51877888, + "loss": 1.4414, + "grad_norm": 1.2989680767059326, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51872512, + "loss": 1.4523, + "grad_norm": 1.3012094497680664, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518742016, + "loss": 1.4199, + "grad_norm": 0.9041076302528381, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518881792, + "loss": 1.3623, + "grad_norm": 0.3771124482154846, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518752768, + "loss": 1.3948, + "grad_norm": 0.6029961705207825, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51874816, + "loss": 1.3976, + "grad_norm": 0.6035098433494568, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518651392, + "loss": 1.405, + "grad_norm": 0.6196001768112183, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518726656, + "loss": 1.3973, + "grad_norm": 0.6879257559776306, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518746624, + "loss": 1.3956, + "grad_norm": 0.5538918972015381, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518712832, + "loss": 1.3795, + "grad_norm": 0.37190234661102295, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518674432, + "loss": 1.3735, + "grad_norm": 0.3085636794567108, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518800384, + "loss": 1.377, + "grad_norm": 0.39224591851234436, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518677504, + "loss": 1.4054, + "grad_norm": 0.6247186660766602, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518720512, + "loss": 1.4117, + "grad_norm": 0.6757923364639282, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518703616, + "loss": 1.3377, + "grad_norm": 0.6002600193023682, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518745088, + "loss": 1.3699, + "grad_norm": 0.48464247584342957, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518685184, + "loss": 1.3536, + "grad_norm": 0.32482197880744934, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518708224, + "loss": 1.3538, + "grad_norm": 0.6164999604225159, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518697472, + "loss": 1.3719, + "grad_norm": 0.6119054555892944, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51870976, + "loss": 1.3856, + "grad_norm": 0.481408953666687, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518712832, + "loss": 1.3416, + "grad_norm": 0.9286465048789978, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 1.76007168, + "gpu_mem": 4.5186944, + "loss": 1.3781, + "grad_norm": 0.3996107578277588, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518700544, + "loss": 1.3709, + "grad_norm": 0.7268552184104919, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51864832, + "loss": 1.4448, + "grad_norm": 1.3985364437103271, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518720512, + "loss": 1.3523, + "grad_norm": 0.4944891333580017, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518761984, + "loss": 1.3528, + "grad_norm": 0.5576210021972656, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51870976, + "loss": 1.3599, + "grad_norm": 0.4506267011165619, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518720512, + "loss": 1.3779, + "grad_norm": 0.4728412628173828, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518703616, + "loss": 1.417, + "grad_norm": 0.9896643757820129, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518758912, + "loss": 1.3641, + "grad_norm": 0.3613266050815582, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51872512, + "loss": 1.3592, + "grad_norm": 0.3578721582889557, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518734336, + "loss": 1.3791, + "grad_norm": 0.4575977325439453, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518735872, + "loss": 1.3432, + "grad_norm": 0.898478627204895, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 1.76007168, + "gpu_mem": 4.5189632, + "loss": 1.391, + "grad_norm": 0.8583600521087646, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51875584, + "loss": 1.3552, + "grad_norm": 0.6637605428695679, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518728192, + "loss": 1.3762, + "grad_norm": 0.9916446208953857, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518692864, + "loss": 1.3372, + "grad_norm": 0.9617828130722046, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518783488, + "loss": 1.3378, + "grad_norm": 1.0396329164505005, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518708224, + "loss": 1.3326, + "grad_norm": 0.9846084117889404, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51875584, + "loss": 1.3582, + "grad_norm": 0.6871299743652344, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518697472, + "loss": 1.3432, + "grad_norm": 1.0301274061203003, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518918656, + "loss": 1.4022, + "grad_norm": 1.3160960674285889, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518800384, + "loss": 1.3705, + "grad_norm": 1.0527888536453247, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518692864, + "loss": 1.3443, + "grad_norm": 1.408197045326233, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 1.76007168, + "gpu_mem": 4.5187328, + "loss": 1.4091, + "grad_norm": 1.0603963136672974, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518780416, + "loss": 1.3116, + "grad_norm": 0.8242224454879761, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518746624, + "loss": 1.3745, + "grad_norm": 1.3695831298828125, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51880192, + "loss": 1.3683, + "grad_norm": 1.0984662771224976, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 1.76007168, + "gpu_mem": 4.5186944, + "loss": 1.3983, + "grad_norm": 0.9569587111473083, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51877888, + "loss": 1.3511, + "grad_norm": 1.2292358875274658, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518768128, + "loss": 1.2777, + "grad_norm": 0.7975842952728271, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518723584, + "loss": 1.3622, + "grad_norm": 0.743346095085144, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518812672, + "loss": 1.4099, + "grad_norm": 0.8635744452476501, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51874048, + "loss": 1.351, + "grad_norm": 0.7639305591583252, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518749696, + "loss": 1.3728, + "grad_norm": 0.675274670124054, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51880192, + "loss": 1.3349, + "grad_norm": 0.7300083041191101, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51880192, + "loss": 1.3263, + "grad_norm": 1.7762560844421387, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518838784, + "loss": 1.3874, + "grad_norm": 1.517770528793335, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51875584, + "loss": 1.3618, + "grad_norm": 0.7803937196731567, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51882496, + "loss": 1.3564, + "grad_norm": 0.6279773712158203, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518766592, + "loss": 1.3549, + "grad_norm": 0.6063970923423767, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518772736, + "loss": 1.3432, + "grad_norm": 1.3413046598434448, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518683648, + "loss": 1.3882, + "grad_norm": 1.0628844499588013, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51880192, + "loss": 1.3386, + "grad_norm": 0.6608291864395142, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 1.3737, + "grad_norm": 0.8615196347236633, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518700544, + "loss": 1.3731, + "grad_norm": 1.3976774215698242, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518749696, + "loss": 1.3276, + "grad_norm": 0.7021074295043945, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518875648, + "loss": 1.3987, + "grad_norm": 0.9128946661949158, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518674432, + "loss": 1.2945, + "grad_norm": 0.773384690284729, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518738944, + "loss": 1.394, + "grad_norm": 0.6943948864936829, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 1.3262, + "grad_norm": 0.6596630811691284, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518745088, + "loss": 1.3543, + "grad_norm": 0.8942705392837524, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518742016, + "loss": 1.2748, + "grad_norm": 0.8936471343040466, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51871744, + "loss": 1.3943, + "grad_norm": 1.3813450336456299, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518700544, + "loss": 1.3472, + "grad_norm": 0.8805654644966125, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51870208, + "loss": 1.3243, + "grad_norm": 0.7401148676872253, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 1.3277, + "grad_norm": 1.1819344758987427, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518820352, + "loss": 1.3373, + "grad_norm": 1.6522026062011719, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518774272, + "loss": 1.2893, + "grad_norm": 1.1229602098464966, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51882496, + "loss": 1.3727, + "grad_norm": 0.8429384827613831, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518795776, + "loss": 1.3397, + "grad_norm": 1.0147525072097778, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518729728, + "loss": 1.3883, + "grad_norm": 1.5138620138168335, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51868672, + "loss": 1.2893, + "grad_norm": 1.2151203155517578, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518714368, + "loss": 1.2093, + "grad_norm": 1.103375792503357, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518697472, + "loss": 1.3679, + "grad_norm": 1.2096805572509766, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51874048, + "loss": 1.2726, + "grad_norm": 1.3399866819381714, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51879424, + "loss": 1.359, + "grad_norm": 1.8377406597137451, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518691328, + "loss": 1.3072, + "grad_norm": 1.6318564414978027, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518751232, + "loss": 1.3126, + "grad_norm": 1.5719292163848877, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518674432, + "loss": 1.3292, + "grad_norm": 1.4618122577667236, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518834176, + "loss": 1.2796, + "grad_norm": 1.6883022785186768, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518688256, + "loss": 1.3202, + "grad_norm": 1.1437427997589111, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518711296, + "loss": 1.2808, + "grad_norm": 1.1591439247131348, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518734336, + "loss": 1.3407, + "grad_norm": 1.7944399118423462, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518958592, + "loss": 1.4001, + "grad_norm": 1.2465510368347168, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518715904, + "loss": 1.4065, + "grad_norm": 1.2306934595108032, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518838784, + "loss": 1.3456, + "grad_norm": 1.2849770784378052, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518754304, + "loss": 1.3614, + "grad_norm": 1.1050775051116943, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518843392, + "loss": 1.3838, + "grad_norm": 1.4563099145889282, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518742016, + "loss": 1.337, + "grad_norm": 1.139607310295105, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518749696, + "loss": 1.3986, + "grad_norm": 0.9873425364494324, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518672896, + "loss": 1.3231, + "grad_norm": 0.886710524559021, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518752768, + "loss": 1.3617, + "grad_norm": 1.024148941040039, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518723584, + "loss": 1.374, + "grad_norm": 1.0300142765045166, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518711296, + "loss": 1.2991, + "grad_norm": 0.8752567768096924, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518854144, + "loss": 1.2597, + "grad_norm": 0.8105007410049438, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518737408, + "loss": 1.3214, + "grad_norm": 0.7056103944778442, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 1.76007168, + "gpu_mem": 4.51874816, + "loss": 1.337, + "grad_norm": 0.8330013751983643, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518903296, + "loss": 1.3355, + "grad_norm": 0.8509153127670288, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518712832, + "loss": 1.3541, + "grad_norm": 0.9288692474365234, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518734336, + "loss": 1.2926, + "grad_norm": 0.7765429019927979, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 1.76007168, + "gpu_mem": 4.5187712, + "loss": 1.3453, + "grad_norm": 0.8282029032707214, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 1.76007168, + "gpu_mem": 4.518358016, + "loss": 1.9735, + "grad_norm": 1.4702869653701782, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493461504, + "loss": 1.2455, + "grad_norm": 1.2181600332260132, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49344, + "loss": 1.32, + "grad_norm": 0.938593864440918, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 1.76007168, + "gpu_mem": 4.49345536, + "loss": 1.3373, + "grad_norm": 1.069080114364624, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493602816, + "loss": 1.2484, + "grad_norm": 1.2214435338974, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493549056, + "loss": 1.2921, + "grad_norm": 1.0366792678833008, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493565952, + "loss": 1.2733, + "grad_norm": 1.3003768920898438, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493518336, + "loss": 1.2991, + "grad_norm": 1.0983554124832153, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493542912, + "loss": 1.2731, + "grad_norm": 1.3412500619888306, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493590528, + "loss": 1.3154, + "grad_norm": 1.7726919651031494, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493499904, + "loss": 1.2522, + "grad_norm": 1.531832218170166, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493472256, + "loss": 1.1643, + "grad_norm": 1.4750925302505493, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493561344, + "loss": 1.2244, + "grad_norm": 1.508220911026001, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493452288, + "loss": 1.1584, + "grad_norm": 1.428913950920105, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493504512, + "loss": 1.2553, + "grad_norm": 1.546115756034851, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 1.76007168, + "gpu_mem": 4.493458432, + "loss": 1.2865, + "grad_norm": 2.0599005222320557, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 1.760268288, + "gpu_mem": 4.493661184, + "loss": 1.2675, + "grad_norm": 1.7912949323654175, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493659648, + "loss": 1.366, + "grad_norm": 2.166999340057373, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493582848, + "loss": 1.1324, + "grad_norm": 1.7166810035705566, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49353984, + "loss": 1.3025, + "grad_norm": 1.8892287015914917, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 1.76203776, + "gpu_mem": 4.4935168, + "loss": 1.118, + "grad_norm": 1.7802501916885376, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49347072, + "loss": 1.278, + "grad_norm": 1.50777268409729, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493587456, + "loss": 1.3338, + "grad_norm": 2.271089553833008, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493475328, + "loss": 1.3109, + "grad_norm": 1.4759130477905273, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493464576, + "loss": 1.2779, + "grad_norm": 1.4350740909576416, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493469184, + "loss": 1.2522, + "grad_norm": 1.4841984510421753, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493584384, + "loss": 1.3288, + "grad_norm": 1.7721344232559204, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493558272, + "loss": 1.2814, + "grad_norm": 1.574934482574463, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493459968, + "loss": 1.236, + "grad_norm": 1.732743501663208, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493495296, + "loss": 1.199, + "grad_norm": 1.4396436214447021, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493461504, + "loss": 1.2584, + "grad_norm": 1.3801759481430054, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493595136, + "loss": 1.2436, + "grad_norm": 1.3777662515640259, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493569024, + "loss": 1.2335, + "grad_norm": 1.6032681465148926, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493613568, + "loss": 1.0828, + "grad_norm": 1.456201434135437, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493504512, + "loss": 1.2449, + "grad_norm": 1.418583631515503, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493521408, + "loss": 1.1226, + "grad_norm": 1.6003406047821045, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493596672, + "loss": 1.3672, + "grad_norm": 2.182643175125122, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493444608, + "loss": 1.3742, + "grad_norm": 2.0892884731292725, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49356288, + "loss": 1.2645, + "grad_norm": 2.3697471618652344, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493552128, + "loss": 1.1674, + "grad_norm": 1.9826672077178955, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493518336, + "loss": 1.2372, + "grad_norm": 1.4054876565933228, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493496832, + "loss": 1.3434, + "grad_norm": 1.941765546798706, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493492224, + "loss": 1.218, + "grad_norm": 1.5344960689544678, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493502976, + "loss": 1.2894, + "grad_norm": 1.672250747680664, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493556736, + "loss": 1.2893, + "grad_norm": 1.8003240823745728, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493473792, + "loss": 1.2601, + "grad_norm": 1.4216355085372925, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49350144, + "loss": 1.2185, + "grad_norm": 1.3393009901046753, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493499904, + "loss": 1.2366, + "grad_norm": 1.55874764919281, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493542912, + "loss": 1.2342, + "grad_norm": 1.7756643295288086, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493541376, + "loss": 1.2803, + "grad_norm": 1.615216851234436, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493592064, + "loss": 1.2534, + "grad_norm": 1.8755874633789062, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493598208, + "loss": 1.2466, + "grad_norm": 1.7218396663665771, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493536768, + "loss": 1.3196, + "grad_norm": 1.6826705932617188, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493526016, + "loss": 1.26, + "grad_norm": 2.08514404296875, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493550592, + "loss": 1.2695, + "grad_norm": 1.7068073749542236, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493567488, + "loss": 1.1762, + "grad_norm": 2.009733200073242, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493438464, + "loss": 1.2178, + "grad_norm": 1.7013733386993408, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493613568, + "loss": 1.1563, + "grad_norm": 2.0011892318725586, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49357056, + "loss": 1.2202, + "grad_norm": 1.993901252746582, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493479936, + "loss": 1.2149, + "grad_norm": 1.7070136070251465, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493427712, + "loss": 1.1617, + "grad_norm": 1.6267485618591309, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493443072, + "loss": 1.3853, + "grad_norm": 2.1260170936584473, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493544448, + "loss": 1.1937, + "grad_norm": 1.8467614650726318, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493553664, + "loss": 1.2177, + "grad_norm": 2.0098745822906494, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493582848, + "loss": 1.2688, + "grad_norm": 2.171715259552002, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493644288, + "loss": 1.3044, + "grad_norm": 2.2359049320220947, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493498368, + "loss": 1.1309, + "grad_norm": 1.8600462675094604, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49358592, + "loss": 1.3233, + "grad_norm": 2.745011806488037, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.2552, + "grad_norm": 1.9253625869750977, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493559808, + "loss": 1.274, + "grad_norm": 1.899074673652649, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493595136, + "loss": 1.1843, + "grad_norm": 1.9185845851898193, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493533696, + "loss": 1.1591, + "grad_norm": 1.7083492279052734, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493490688, + "loss": 1.2231, + "grad_norm": 1.7697157859802246, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493492224, + "loss": 1.2695, + "grad_norm": 1.7851979732513428, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493529088, + "loss": 1.1649, + "grad_norm": 1.5936659574508667, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49350144, + "loss": 1.1926, + "grad_norm": 1.71742844581604, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493558272, + "loss": 1.2499, + "grad_norm": 1.9231393337249756, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493587456, + "loss": 1.1951, + "grad_norm": 2.21256160736084, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493487616, + "loss": 1.2108, + "grad_norm": 2.1370201110839844, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493510656, + "loss": 1.1645, + "grad_norm": 1.916337251663208, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49348608, + "loss": 1.2155, + "grad_norm": 1.9203264713287354, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49357824, + "loss": 1.2519, + "grad_norm": 1.898951530456543, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49344768, + "loss": 1.1444, + "grad_norm": 1.9629889726638794, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493584384, + "loss": 1.3055, + "grad_norm": 1.9164798259735107, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493441536, + "loss": 1.1895, + "grad_norm": 1.8079711198806763, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493464576, + "loss": 1.2511, + "grad_norm": 2.447209596633911, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493512192, + "loss": 1.3152, + "grad_norm": 2.4001779556274414, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 1.76203776, + "gpu_mem": 4.4935552, + "loss": 1.2615, + "grad_norm": 1.8391376733779907, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493483008, + "loss": 1.2342, + "grad_norm": 1.9063161611557007, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493581312, + "loss": 1.2455, + "grad_norm": 1.8921847343444824, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493476864, + "loss": 1.2156, + "grad_norm": 1.795665979385376, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493632, + "loss": 1.1714, + "grad_norm": 1.7185704708099365, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493464576, + "loss": 1.2996, + "grad_norm": 1.7589646577835083, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493507584, + "loss": 1.2397, + "grad_norm": 1.9653416872024536, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493529088, + "loss": 1.182, + "grad_norm": 1.738325595855713, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493452288, + "loss": 1.2829, + "grad_norm": 2.025881767272949, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493588992, + "loss": 1.1752, + "grad_norm": 1.9629648923873901, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493476864, + "loss": 1.1264, + "grad_norm": 1.7972261905670166, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493473792, + "loss": 1.3247, + "grad_norm": 1.8664318323135376, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49344768, + "loss": 1.312, + "grad_norm": 2.0916459560394287, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493483008, + "loss": 1.1534, + "grad_norm": 1.9871327877044678, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493498368, + "loss": 1.1431, + "grad_norm": 1.7869534492492676, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493492224, + "loss": 1.1754, + "grad_norm": 1.9198211431503296, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493504512, + "loss": 1.2884, + "grad_norm": 2.1243364810943604, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493536768, + "loss": 1.1575, + "grad_norm": 1.9047483205795288, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493538304, + "loss": 1.2597, + "grad_norm": 1.6771018505096436, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493510656, + "loss": 1.2561, + "grad_norm": 1.8822518587112427, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493559808, + "loss": 1.1794, + "grad_norm": 1.9333287477493286, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.2478, + "grad_norm": 2.0478763580322266, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49354752, + "loss": 1.1102, + "grad_norm": 1.986507534980774, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493538304, + "loss": 1.0509, + "grad_norm": 1.59932279586792, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493472256, + "loss": 1.2167, + "grad_norm": 2.034536838531494, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49349376, + "loss": 1.1723, + "grad_norm": 1.7690469026565552, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493549056, + "loss": 1.2906, + "grad_norm": 2.5948591232299805, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493466112, + "loss": 1.2184, + "grad_norm": 1.8858314752578735, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493530624, + "loss": 1.2638, + "grad_norm": 2.2522408962249756, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.1362, + "grad_norm": 1.9693809747695923, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493521408, + "loss": 1.2661, + "grad_norm": 1.9689873456954956, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493495296, + "loss": 1.1627, + "grad_norm": 1.8979052305221558, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.2499, + "grad_norm": 2.0185647010803223, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493499904, + "loss": 1.3141, + "grad_norm": 2.452040195465088, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493404672, + "loss": 1.264, + "grad_norm": 1.9577916860580444, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493473792, + "loss": 1.178, + "grad_norm": 2.272634267807007, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493521408, + "loss": 1.2506, + "grad_norm": 2.312358856201172, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493466112, + "loss": 1.3101, + "grad_norm": 2.876584768295288, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49352448, + "loss": 1.3522, + "grad_norm": 2.5750203132629395, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493530624, + "loss": 1.2111, + "grad_norm": 2.4082119464874268, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493667328, + "loss": 1.1283, + "grad_norm": 1.8970468044281006, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493476864, + "loss": 1.1709, + "grad_norm": 1.7273095846176147, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493541376, + "loss": 1.2047, + "grad_norm": 2.3556737899780273, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493456896, + "loss": 1.1963, + "grad_norm": 2.080383539199829, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493473792, + "loss": 1.1709, + "grad_norm": 2.141674280166626, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493473792, + "loss": 1.1986, + "grad_norm": 1.9965574741363525, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493599744, + "loss": 1.292, + "grad_norm": 2.0534558296203613, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493479936, + "loss": 1.0813, + "grad_norm": 2.1535611152648926, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493638144, + "loss": 1.1553, + "grad_norm": 1.9631518125534058, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49346304, + "loss": 1.229, + "grad_norm": 1.8764824867248535, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493481472, + "loss": 1.2978, + "grad_norm": 2.308229923248291, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493658112, + "loss": 1.1347, + "grad_norm": 1.8085517883300781, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49353216, + "loss": 1.1449, + "grad_norm": 1.8577075004577637, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493529088, + "loss": 1.2416, + "grad_norm": 2.0989925861358643, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493538304, + "loss": 1.199, + "grad_norm": 1.9798862934112549, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493413888, + "loss": 1.2285, + "grad_norm": 2.0462238788604736, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493696512, + "loss": 1.2764, + "grad_norm": 2.374398946762085, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493459968, + "loss": 1.1332, + "grad_norm": 1.9502490758895874, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 1.76203776, + "gpu_mem": 4.4935552, + "loss": 1.273, + "grad_norm": 1.7606923580169678, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493521408, + "loss": 1.2657, + "grad_norm": 2.5668246746063232, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.309, + "grad_norm": 2.449667453765869, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493507584, + "loss": 1.3355, + "grad_norm": 2.0415592193603516, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493558272, + "loss": 1.4031, + "grad_norm": 2.582411766052246, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493453824, + "loss": 1.2835, + "grad_norm": 2.2976901531219482, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493483008, + "loss": 1.1795, + "grad_norm": 2.0415689945220947, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493426176, + "loss": 1.2864, + "grad_norm": 2.0273592472076416, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49350144, + "loss": 1.1257, + "grad_norm": 1.879783034324646, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49349376, + "loss": 1.1603, + "grad_norm": 1.8836020231246948, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49352448, + "loss": 1.2114, + "grad_norm": 1.913467526435852, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493453824, + "loss": 1.2338, + "grad_norm": 1.7854373455047607, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493479936, + "loss": 1.2323, + "grad_norm": 2.151526927947998, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493575168, + "loss": 1.2371, + "grad_norm": 2.312635660171509, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49347072, + "loss": 1.1066, + "grad_norm": 1.846198558807373, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493479936, + "loss": 1.2311, + "grad_norm": 1.9019705057144165, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493507584, + "loss": 1.3848, + "grad_norm": 2.4176759719848633, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493521408, + "loss": 1.3049, + "grad_norm": 2.3215224742889404, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493515264, + "loss": 1.1218, + "grad_norm": 1.8951525688171387, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 1.76203776, + "gpu_mem": 4.4934784, + "loss": 1.1375, + "grad_norm": 1.8825587034225464, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493536768, + "loss": 1.2921, + "grad_norm": 2.819791555404663, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493446144, + "loss": 1.2541, + "grad_norm": 2.208458185195923, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493530624, + "loss": 1.2111, + "grad_norm": 2.3804702758789062, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493569024, + "loss": 1.0987, + "grad_norm": 2.3462255001068115, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493613568, + "loss": 1.3643, + "grad_norm": 2.1674201488494873, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493575168, + "loss": 1.1861, + "grad_norm": 1.8091249465942383, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493512192, + "loss": 1.2593, + "grad_norm": 2.0843887329101562, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493565952, + "loss": 1.3696, + "grad_norm": 2.9274420738220215, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493504512, + "loss": 1.242, + "grad_norm": 2.147740364074707, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493490688, + "loss": 1.1021, + "grad_norm": 1.770539402961731, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493702656, + "loss": 1.0944, + "grad_norm": 1.9329615831375122, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493567488, + "loss": 1.1877, + "grad_norm": 2.5771710872650146, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493541376, + "loss": 1.1705, + "grad_norm": 1.9423385858535767, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49354752, + "loss": 1.2555, + "grad_norm": 2.0403058528900146, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493581312, + "loss": 1.3293, + "grad_norm": 2.9506332874298096, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.2842, + "grad_norm": 2.158268451690674, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493573632, + "loss": 1.1999, + "grad_norm": 2.2902932167053223, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493461504, + "loss": 1.1833, + "grad_norm": 1.8559632301330566, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49352448, + "loss": 1.2502, + "grad_norm": 2.130039930343628, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.1142, + "grad_norm": 1.8459100723266602, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493489152, + "loss": 1.242, + "grad_norm": 2.0775270462036133, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493573632, + "loss": 1.2746, + "grad_norm": 1.9629416465759277, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493519872, + "loss": 1.2671, + "grad_norm": 1.9143123626708984, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49353984, + "loss": 1.2928, + "grad_norm": 2.4918973445892334, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493467648, + "loss": 1.2651, + "grad_norm": 2.517381191253662, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 1.76203776, + "gpu_mem": 4.4935552, + "loss": 1.3094, + "grad_norm": 2.2002789974212646, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493522944, + "loss": 1.1956, + "grad_norm": 2.083509683609009, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 1.76203776, + "gpu_mem": 4.49350144, + "loss": 1.0768, + "grad_norm": 1.761749029159546, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493444608, + "loss": 1.1917, + "grad_norm": 2.3449466228485107, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493510656, + "loss": 1.3222, + "grad_norm": 2.236262321472168, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493552128, + "loss": 1.2081, + "grad_norm": 1.8682501316070557, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 1.76203776, + "gpu_mem": 4.493552128, + "train_runtime": 8574.3605, + "train_samples_per_second": 4.397, + "train_steps_per_second": 0.069, + "total_flos": 8.875874857128346e+16, + "train_loss": 1.3685422028814043 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26ebe9ef584396639cb6b281f2c8108d7f3fd14a --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6265bebc04611478a1ac53bc47128662e069da75 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5098658247829518 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..e66a7decd56ed39235578547803507de700b2e75 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-winogrande-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2", + "seed": 42, + "timestamp": "2025-08-29T17:52:16.681914" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..242917a28bdf0e73c6a8f477b7b18457bc1d1b0f --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r2-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.683136512, + "gpu_mem": 4.423642624, + "loss": 3.3802, + "grad_norm": 13.83266544342041, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.689034752, + "gpu_mem": 4.436256256, + "loss": 3.3361, + "grad_norm": 13.677742004394531, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.689427968, + "gpu_mem": 4.436260864, + "loss": 3.2225, + "grad_norm": 12.8506498336792, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.689821184, + "gpu_mem": 4.436259328, + "loss": 3.1714, + "grad_norm": 13.24236011505127, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.6902144, + "gpu_mem": 4.436259328, + "loss": 3.1628, + "grad_norm": 13.221402168273926, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.690411008, + "gpu_mem": 4.436265472, + "loss": 3.1174, + "grad_norm": 13.739409446716309, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.690804224, + "gpu_mem": 4.436271616, + "loss": 2.9677, + "grad_norm": 13.865226745605469, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.691000832, + "gpu_mem": 4.43625472, + "loss": 2.849, + "grad_norm": 13.190237045288086, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.69119744, + "gpu_mem": 4.436260864, + "loss": 2.9193, + "grad_norm": 14.35507869720459, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.691394048, + "gpu_mem": 4.436263936, + "loss": 2.5966, + "grad_norm": 14.376572608947754, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.691590656, + "gpu_mem": 4.436253184, + "loss": 2.3806, + "grad_norm": 14.181256294250488, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.691787264, + "gpu_mem": 4.436257792, + "loss": 2.3082, + "grad_norm": 13.987662315368652, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.691787264, + "gpu_mem": 4.436265472, + "loss": 1.9145, + "grad_norm": 12.040809631347656, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.691787264, + "gpu_mem": 4.436260864, + "loss": 1.6822, + "grad_norm": 10.132392883300781, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.691983872, + "gpu_mem": 4.436260864, + "loss": 1.39, + "grad_norm": 7.734875202178955, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.69218048, + "gpu_mem": 4.436257792, + "loss": 1.2327, + "grad_norm": 7.8858513832092285, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.69218048, + "gpu_mem": 4.436257792, + "loss": 0.9848, + "grad_norm": 5.989914417266846, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.692377088, + "gpu_mem": 4.436260864, + "loss": 0.9464, + "grad_norm": 4.825669765472412, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.692377088, + "gpu_mem": 4.436257792, + "loss": 0.8834, + "grad_norm": 3.6761608123779297, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.692377088, + "gpu_mem": 4.436265472, + "loss": 0.8944, + "grad_norm": 5.350832939147949, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.692573696, + "gpu_mem": 4.436257792, + "loss": 0.7698, + "grad_norm": 2.1533663272857666, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.692573696, + "gpu_mem": 4.436257792, + "loss": 0.7243, + "grad_norm": 1.919243335723877, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.692770304, + "gpu_mem": 4.436253184, + "loss": 0.696, + "grad_norm": 1.9062637090682983, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.692770304, + "gpu_mem": 4.436256256, + "loss": 0.7639, + "grad_norm": 4.81977653503418, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.692770304, + "gpu_mem": 4.436259328, + "loss": 0.7096, + "grad_norm": 4.102127552032471, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.692770304, + "gpu_mem": 4.43625472, + "loss": 0.7525, + "grad_norm": 6.262319564819336, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.692770304, + "gpu_mem": 4.436253184, + "loss": 0.7015, + "grad_norm": 3.219036817550659, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.692770304, + "gpu_mem": 4.436259328, + "loss": 0.7124, + "grad_norm": 1.083777666091919, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.692966912, + "gpu_mem": 4.436257792, + "loss": 0.7097, + "grad_norm": 2.689441680908203, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.692966912, + "gpu_mem": 4.436257792, + "loss": 0.7561, + "grad_norm": 3.7544941902160645, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.692966912, + "gpu_mem": 4.436257792, + "loss": 0.6836, + "grad_norm": 1.2491235733032227, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.692966912, + "gpu_mem": 4.43625472, + "loss": 0.6529, + "grad_norm": 1.57710599899292, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.69316352, + "gpu_mem": 4.43625472, + "loss": 0.8592, + "grad_norm": 8.028727531433105, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.69316352, + "gpu_mem": 4.43625472, + "loss": 0.8269, + "grad_norm": 6.71557092666626, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.69316352, + "gpu_mem": 4.436260864, + "loss": 0.7235, + "grad_norm": 3.196037769317627, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.69316352, + "gpu_mem": 4.436256256, + "loss": 0.7128, + "grad_norm": 1.524878978729248, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.69316352, + "gpu_mem": 4.43625472, + "loss": 0.6914, + "grad_norm": 0.8804712295532227, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.69316352, + "gpu_mem": 4.436259328, + "loss": 0.7069, + "grad_norm": 2.3322417736053467, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.69316352, + "gpu_mem": 4.436265472, + "loss": 0.7455, + "grad_norm": 3.4839231967926025, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.69316352, + "gpu_mem": 4.4362624, + "loss": 0.711, + "grad_norm": 0.6303919553756714, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.69316352, + "gpu_mem": 4.4362624, + "loss": 0.7101, + "grad_norm": 0.8052319288253784, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.69316352, + "gpu_mem": 4.436259328, + "loss": 0.7158, + "grad_norm": 2.7432708740234375, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.69316352, + "gpu_mem": 4.436259328, + "loss": 0.681, + "grad_norm": 0.3840685784816742, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7261, + "grad_norm": 2.041940212249756, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6987, + "grad_norm": 1.1497113704681396, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7156, + "grad_norm": 1.0560545921325684, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7432, + "grad_norm": 2.2676539421081543, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7501, + "grad_norm": 3.221907138824463, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6659, + "grad_norm": 1.400289535522461, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7218, + "grad_norm": 1.126739263534546, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6977, + "grad_norm": 1.2122461795806885, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6936, + "grad_norm": 0.5441356897354126, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.692, + "grad_norm": 1.0445122718811035, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436248576, + "loss": 0.7277, + "grad_norm": 2.425431489944458, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6981, + "grad_norm": 1.3362460136413574, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6951, + "grad_norm": 0.685305655002594, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6971, + "grad_norm": 0.9880672693252563, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7045, + "grad_norm": 1.0030702352523804, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.7091, + "grad_norm": 1.6192041635513306, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6997, + "grad_norm": 0.5898262858390808, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6999, + "grad_norm": 0.6567052602767944, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6992, + "grad_norm": 0.31528347730636597, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6855, + "grad_norm": 0.5848697423934937, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.7401, + "grad_norm": 2.95882248878479, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6874, + "grad_norm": 0.9511045217514038, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.747, + "grad_norm": 3.0018863677978516, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6813, + "grad_norm": 0.21937806904315948, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7029, + "grad_norm": 0.6779627799987793, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6929, + "grad_norm": 0.8617700338363647, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6875, + "grad_norm": 0.20560003817081451, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.702, + "grad_norm": 1.0989729166030884, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6967, + "grad_norm": 0.5222887992858887, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7015, + "grad_norm": 1.6201422214508057, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6998, + "grad_norm": 0.5089905261993408, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.682, + "grad_norm": 0.8053684830665588, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7489, + "grad_norm": 3.032691240310669, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.7145, + "grad_norm": 1.4360599517822266, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6958, + "grad_norm": 1.1082075834274292, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7027, + "grad_norm": 0.5354030132293701, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7262, + "grad_norm": 1.5854023694992065, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.7027, + "grad_norm": 1.3265489339828491, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6816, + "grad_norm": 0.24732555449008942, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6854, + "grad_norm": 0.9271851181983948, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7179, + "grad_norm": 1.7710685729980469, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7294, + "grad_norm": 1.8168729543685913, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.712, + "grad_norm": 0.9750337600708008, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6903, + "grad_norm": 0.336660236120224, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.697, + "grad_norm": 0.7233413457870483, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7095, + "grad_norm": 0.9539067149162292, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.69, + "grad_norm": 0.2083374261856079, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6665, + "grad_norm": 0.7599075436592102, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7407, + "grad_norm": 1.652909517288208, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7184, + "grad_norm": 1.0809563398361206, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7078, + "grad_norm": 0.5733321309089661, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6879, + "grad_norm": 0.39110442996025085, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6977, + "grad_norm": 0.9494445323944092, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7025, + "grad_norm": 0.5269314646720886, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7053, + "grad_norm": 0.594878613948822, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6788, + "grad_norm": 0.7555477023124695, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7049, + "grad_norm": 0.8187215328216553, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6938, + "grad_norm": 0.5182422995567322, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.7128, + "grad_norm": 1.0362550020217896, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6893, + "grad_norm": 0.27027931809425354, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6942, + "grad_norm": 0.36158162355422974, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6934, + "grad_norm": 0.3102886378765106, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7082, + "grad_norm": 0.8535572290420532, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6995, + "grad_norm": 0.5979053974151611, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6898, + "grad_norm": 0.16138623654842377, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7123, + "grad_norm": 0.42164984345436096, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6914, + "grad_norm": 0.5988208651542664, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7045, + "grad_norm": 0.5512078404426575, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.703, + "grad_norm": 0.5977473855018616, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436250112, + "loss": 0.6776, + "grad_norm": 0.30862340331077576, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.694, + "grad_norm": 0.30562731623649597, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7016, + "grad_norm": 0.640714168548584, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7229, + "grad_norm": 1.99752938747406, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.693, + "grad_norm": 0.42917120456695557, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.691, + "grad_norm": 0.36744749546051025, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7383, + "grad_norm": 1.8811557292938232, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7039, + "grad_norm": 0.6467918753623962, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6976, + "grad_norm": 0.21314767003059387, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6975, + "grad_norm": 1.2783654928207397, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.698, + "grad_norm": 0.3042077124118805, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6954, + "grad_norm": 0.20429036021232605, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436250112, + "loss": 0.6957, + "grad_norm": 0.13123148679733276, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6941, + "grad_norm": 0.3629641532897949, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6946, + "grad_norm": 0.23763252794742584, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6966, + "grad_norm": 0.4855000078678131, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6973, + "grad_norm": 0.1841968148946762, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7029, + "grad_norm": 0.13550734519958496, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6998, + "grad_norm": 0.44782084226608276, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6983, + "grad_norm": 0.676345705986023, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6985, + "grad_norm": 1.3368337154388428, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6979, + "grad_norm": 0.27397415041923523, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6792, + "grad_norm": 0.49170124530792236, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6776, + "grad_norm": 0.17699812352657318, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6957, + "grad_norm": 0.3484092950820923, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7159, + "grad_norm": 0.7555202841758728, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7279, + "grad_norm": 0.9190186858177185, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6883, + "grad_norm": 0.20900030434131622, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6821, + "grad_norm": 0.13714168965816498, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6883, + "grad_norm": 0.19246403872966766, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6982, + "grad_norm": 0.1476067155599594, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6947, + "grad_norm": 0.1320081502199173, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6973, + "grad_norm": 0.6157453060150146, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6942, + "grad_norm": 0.5585399866104126, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7005, + "grad_norm": 0.5287559628486633, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6926, + "grad_norm": 0.15172460675239563, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6828, + "grad_norm": 0.3334523141384125, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6852, + "grad_norm": 0.251968652009964, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7116, + "grad_norm": 0.6656016111373901, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6987, + "grad_norm": 0.28842583298683167, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6891, + "grad_norm": 0.1148039847612381, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6968, + "grad_norm": 0.342706561088562, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7, + "grad_norm": 0.6660858392715454, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.7002, + "grad_norm": 0.38698098063468933, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6588, + "grad_norm": 1.5113012790679932, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436268544, + "loss": 0.7055, + "grad_norm": 0.7014216780662537, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6904, + "grad_norm": 0.3366650342941284, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7179, + "grad_norm": 1.0065648555755615, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7169, + "grad_norm": 0.8127135634422302, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.7042, + "grad_norm": 0.6570464968681335, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6979, + "grad_norm": 0.8232495784759521, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6791, + "grad_norm": 0.3254877030849457, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6834, + "grad_norm": 0.23941670358181, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.75, + "grad_norm": 1.4399815797805786, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7547, + "grad_norm": 1.2823718786239624, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.726, + "grad_norm": 0.6834604144096375, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7128, + "grad_norm": 0.45957425236701965, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.7173, + "grad_norm": 0.5689762234687805, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6916, + "grad_norm": 0.49433183670043945, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7018, + "grad_norm": 0.3783508539199829, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436250112, + "loss": 0.7092, + "grad_norm": 0.2643006443977356, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.702, + "grad_norm": 0.3481295704841614, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7187, + "grad_norm": 1.0913372039794922, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6922, + "grad_norm": 0.39305877685546875, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7198, + "grad_norm": 0.8478072881698608, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6884, + "grad_norm": 0.30154484510421753, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7072, + "grad_norm": 0.4667106866836548, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6918, + "grad_norm": 0.3425340950489044, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436273152, + "loss": 0.6782, + "grad_norm": 1.217700719833374, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7368, + "grad_norm": 3.345475196838379, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 1.0401, + "grad_norm": 46.40269470214844, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7999, + "grad_norm": 11.832414627075195, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.691, + "grad_norm": 0.8041672110557556, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6935, + "grad_norm": 1.0832329988479614, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6886, + "grad_norm": 2.6858136653900146, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7652, + "grad_norm": 4.65596342086792, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.8064, + "grad_norm": 4.585738182067871, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6876, + "grad_norm": 0.5261756181716919, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6814, + "grad_norm": 0.41349083185195923, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6897, + "grad_norm": 1.898442029953003, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6997, + "grad_norm": 0.4439702332019806, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6846, + "grad_norm": 0.3118208646774292, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.729, + "grad_norm": 1.5808017253875732, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6857, + "grad_norm": 0.329436719417572, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.7053, + "grad_norm": 0.3449731469154358, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7007, + "grad_norm": 0.5285054445266724, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6825, + "grad_norm": 0.2227252721786499, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7208, + "grad_norm": 0.9508650898933411, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.7013, + "grad_norm": 0.4106709063053131, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7092, + "grad_norm": 0.7245640754699707, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6747, + "grad_norm": 0.7311031818389893, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6953, + "grad_norm": 0.24077805876731873, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6951, + "grad_norm": 0.3526243567466736, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7119, + "grad_norm": 0.4926137924194336, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6975, + "grad_norm": 0.19842113554477692, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6841, + "grad_norm": 0.2998376786708832, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7091, + "grad_norm": 0.5752790570259094, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7, + "grad_norm": 0.3764725625514984, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7151, + "grad_norm": 0.9053231477737427, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6982, + "grad_norm": 0.2878480553627014, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6932, + "grad_norm": 0.1362980306148529, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6955, + "grad_norm": 0.32281240820884705, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6883, + "grad_norm": 0.4131995141506195, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6949, + "grad_norm": 0.6259315013885498, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.699, + "grad_norm": 0.28524893522262573, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.701, + "grad_norm": 0.7453063726425171, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6958, + "grad_norm": 0.158852681517601, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6927, + "grad_norm": 0.0811055600643158, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6921, + "grad_norm": 0.7510029077529907, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6939, + "grad_norm": 0.17717188596725464, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6908, + "grad_norm": 0.32211142778396606, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6917, + "grad_norm": 0.14353682100772858, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7058, + "grad_norm": 0.6179647445678711, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7063, + "grad_norm": 0.48252543807029724, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6991, + "grad_norm": 0.14725713431835175, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6946, + "grad_norm": 0.11413919925689697, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.696, + "grad_norm": 0.18780174851417542, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6944, + "grad_norm": 0.32025763392448425, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6876, + "grad_norm": 0.396868497133255, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6909, + "grad_norm": 0.11610713601112366, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6976, + "grad_norm": 0.2371203601360321, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436268544, + "loss": 0.6909, + "grad_norm": 0.18773791193962097, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7019, + "grad_norm": 0.516610860824585, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6916, + "grad_norm": 0.5660571455955505, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.7029, + "grad_norm": 0.5866164565086365, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6957, + "grad_norm": 0.9667791724205017, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7041, + "grad_norm": 0.5165679454803467, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7028, + "grad_norm": 0.34334683418273926, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6692, + "grad_norm": 0.2925838530063629, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6794, + "grad_norm": 0.06948590278625488, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.746, + "grad_norm": 1.190832495689392, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7212, + "grad_norm": 0.7033959031105042, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7084, + "grad_norm": 0.5916748642921448, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6912, + "grad_norm": 0.1240658387541771, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6936, + "grad_norm": 0.07295471429824829, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.708, + "grad_norm": 0.6033907532691956, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6853, + "grad_norm": 0.1042880117893219, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.721, + "grad_norm": 0.6854376792907715, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.691, + "grad_norm": 0.07989989221096039, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6856, + "grad_norm": 0.19201266765594482, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6984, + "grad_norm": 0.12678585946559906, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6979, + "grad_norm": 0.14823156595230103, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6904, + "grad_norm": 0.14893005788326263, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7003, + "grad_norm": 0.2503890097141266, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6958, + "grad_norm": 0.08558503538370132, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6983, + "grad_norm": 0.32956600189208984, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6989, + "grad_norm": 0.4328124225139618, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6917, + "grad_norm": 0.24160081148147583, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6992, + "grad_norm": 0.6626600623130798, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6922, + "grad_norm": 0.12787054479122162, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436268544, + "loss": 0.6883, + "grad_norm": 0.09814996272325516, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6912, + "grad_norm": 0.08509663492441177, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7028, + "grad_norm": 0.2746524214744568, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6883, + "grad_norm": 0.1659913957118988, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6874, + "grad_norm": 0.14900967478752136, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7044, + "grad_norm": 0.5474597811698914, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6908, + "grad_norm": 0.29957595467567444, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6999, + "grad_norm": 0.4173649251461029, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6949, + "grad_norm": 0.09360115230083466, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6982, + "grad_norm": 0.6088557243347168, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6949, + "grad_norm": 0.25441962480545044, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.69, + "grad_norm": 0.4583888351917267, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6987, + "grad_norm": 0.3496013283729553, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7, + "grad_norm": 0.3031418025493622, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.7006, + "grad_norm": 0.45816949009895325, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6914, + "grad_norm": 0.18061217665672302, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7024, + "grad_norm": 0.4965406060218811, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6951, + "grad_norm": 0.1667080819606781, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6928, + "grad_norm": 0.35813096165657043, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7049, + "grad_norm": 0.36866915225982666, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7197, + "grad_norm": 1.0709607601165771, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7019, + "grad_norm": 0.3815457820892334, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6957, + "grad_norm": 0.1568594127893448, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7024, + "grad_norm": 0.7786321043968201, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6978, + "grad_norm": 0.24292002618312836, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6965, + "grad_norm": 0.16959260404109955, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7021, + "grad_norm": 0.3279533088207245, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6931, + "grad_norm": 0.2376861423254013, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6922, + "grad_norm": 0.06240609660744667, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436248576, + "loss": 0.6957, + "grad_norm": 0.09472914785146713, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7097, + "grad_norm": 0.5675989389419556, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436250112, + "loss": 0.7137, + "grad_norm": 0.7588828802108765, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6931, + "grad_norm": 0.11409120261669159, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6921, + "grad_norm": 0.3078445494174957, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6943, + "grad_norm": 0.08923359960317612, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6972, + "grad_norm": 0.12796567380428314, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6924, + "grad_norm": 0.19764314591884613, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7021, + "grad_norm": 0.5562687516212463, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6936, + "grad_norm": 0.11589452624320984, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6915, + "grad_norm": 0.1378728449344635, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436268544, + "loss": 0.6951, + "grad_norm": 0.1886414885520935, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6933, + "grad_norm": 0.2282310575246811, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6904, + "grad_norm": 0.2508070468902588, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.695, + "grad_norm": 0.11800406873226166, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7056, + "grad_norm": 0.46052825450897217, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6876, + "grad_norm": 0.11014346778392792, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7102, + "grad_norm": 0.6226047277450562, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.684, + "grad_norm": 0.23348534107208252, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6894, + "grad_norm": 0.14578792452812195, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6978, + "grad_norm": 0.2315908521413803, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6895, + "grad_norm": 0.13921763002872467, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6907, + "grad_norm": 0.08176339417695999, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6911, + "grad_norm": 0.17442116141319275, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6965, + "grad_norm": 0.42122164368629456, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6946, + "grad_norm": 0.32773977518081665, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6917, + "grad_norm": 0.18610990047454834, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6847, + "grad_norm": 0.2868407070636749, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.687, + "grad_norm": 0.3258173167705536, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6605, + "grad_norm": 0.6032968759536743, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.697, + "grad_norm": 0.318297803401947, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7099, + "grad_norm": 0.51422518491745, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436268544, + "loss": 0.7436, + "grad_norm": 0.9612325429916382, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7181, + "grad_norm": 0.5938482284545898, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.7105, + "grad_norm": 0.459426611661911, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6888, + "grad_norm": 0.07044217735528946, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7018, + "grad_norm": 0.2940356135368347, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6996, + "grad_norm": 0.4257708787918091, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6919, + "grad_norm": 0.07252331078052521, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6961, + "grad_norm": 0.1330239176750183, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6841, + "grad_norm": 0.2875259816646576, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6854, + "grad_norm": 0.14372658729553223, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.701, + "grad_norm": 0.2542774975299835, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7029, + "grad_norm": 0.29772529006004333, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6757, + "grad_norm": 0.11097745597362518, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.7261, + "grad_norm": 0.6528292298316956, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7231, + "grad_norm": 0.6860221028327942, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7002, + "grad_norm": 0.4148077964782715, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6984, + "grad_norm": 0.2157445102930069, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6951, + "grad_norm": 0.26303157210350037, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.692, + "grad_norm": 0.3659428656101227, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6889, + "grad_norm": 0.14791420102119446, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7031, + "grad_norm": 0.5589924454689026, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7005, + "grad_norm": 0.4566138982772827, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.697, + "grad_norm": 0.1042766273021698, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6937, + "grad_norm": 0.25352856516838074, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6937, + "grad_norm": 0.2577526867389679, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43624704, + "loss": 0.6976, + "grad_norm": 0.14251722395420074, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6907, + "grad_norm": 0.07933853566646576, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6916, + "grad_norm": 0.28553393483161926, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.692, + "grad_norm": 0.09082792699337006, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6984, + "grad_norm": 0.18411611020565033, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6993, + "grad_norm": 1.1169296503067017, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6938, + "grad_norm": 0.27125120162963867, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.7014, + "grad_norm": 0.32632121443748474, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7009, + "grad_norm": 0.430666983127594, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6949, + "grad_norm": 0.3346516191959381, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6986, + "grad_norm": 0.17481577396392822, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6967, + "grad_norm": 0.32030394673347473, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6959, + "grad_norm": 0.0826176255941391, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6965, + "grad_norm": 0.47450363636016846, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.69, + "grad_norm": 0.10716969519853592, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6986, + "grad_norm": 0.29357340931892395, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.707, + "grad_norm": 0.413950651884079, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6958, + "grad_norm": 0.1902361810207367, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6806, + "grad_norm": 0.2311578392982483, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6823, + "grad_norm": 0.17461833357810974, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6842, + "grad_norm": 0.1586795449256897, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6984, + "grad_norm": 0.15812192857265472, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6977, + "grad_norm": 0.21657542884349823, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.7059, + "grad_norm": 0.2652151584625244, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6942, + "grad_norm": 0.1452542245388031, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43627008, + "loss": 0.6811, + "grad_norm": 0.2203839272260666, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7, + "grad_norm": 0.3006785213947296, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6988, + "grad_norm": 0.3415546417236328, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43627008, + "loss": 0.6993, + "grad_norm": 0.24600288271903992, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6945, + "grad_norm": 0.27653932571411133, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.697, + "grad_norm": 0.22414368391036987, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6916, + "grad_norm": 0.3384087085723877, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6981, + "grad_norm": 0.19719506800174713, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6843, + "grad_norm": 0.18984340131282806, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.691, + "grad_norm": 0.11130431294441223, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7162, + "grad_norm": 0.46890363097190857, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7203, + "grad_norm": 0.6568487882614136, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.7057, + "grad_norm": 0.322986364364624, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6856, + "grad_norm": 0.10663679242134094, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7029, + "grad_norm": 0.3104194402694702, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6927, + "grad_norm": 0.10474424064159393, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7001, + "grad_norm": 0.3399132788181305, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6948, + "grad_norm": 0.12807130813598633, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6924, + "grad_norm": 0.07062850892543793, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6936, + "grad_norm": 0.14604097604751587, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6955, + "grad_norm": 0.14697588980197906, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.675, + "grad_norm": 0.4674135744571686, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6945, + "grad_norm": 0.07653773576021194, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6926, + "grad_norm": 0.0753614529967308, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6985, + "grad_norm": 0.12248300760984421, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7133, + "grad_norm": 0.3862832486629486, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7127, + "grad_norm": 0.3646034002304077, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6801, + "grad_norm": 0.1853150725364685, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6983, + "grad_norm": 0.16224218904972076, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6923, + "grad_norm": 0.10327368229627609, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7079, + "grad_norm": 0.3454728424549103, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6887, + "grad_norm": 0.07829678058624268, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.69, + "grad_norm": 0.08624996989965439, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6914, + "grad_norm": 0.07039596885442734, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6901, + "grad_norm": 0.06734960526227951, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6908, + "grad_norm": 0.05782429501414299, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6875, + "grad_norm": 0.17067015171051025, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.703, + "grad_norm": 0.3225139379501343, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6918, + "grad_norm": 0.09326571971178055, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6892, + "grad_norm": 0.1001659408211708, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6954, + "grad_norm": 0.08983571827411652, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.694, + "grad_norm": 0.14595626294612885, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6992, + "grad_norm": 0.307039350271225, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6984, + "grad_norm": 0.27847781777381897, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6918, + "grad_norm": 0.18990810215473175, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6947, + "grad_norm": 0.3568650186061859, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6897, + "grad_norm": 0.06021081283688545, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7005, + "grad_norm": 0.23057764768600464, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6969, + "grad_norm": 0.37124714255332947, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.689, + "grad_norm": 0.41757890582084656, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6959, + "grad_norm": 0.14373281598091125, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6946, + "grad_norm": 0.15632052719593048, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6889, + "grad_norm": 0.2295098900794983, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.695, + "grad_norm": 0.07158998399972916, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.7051, + "grad_norm": 0.4923437833786011, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6944, + "grad_norm": 0.116938017308712, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6915, + "grad_norm": 0.06653156876564026, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6855, + "grad_norm": 0.2958771288394928, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6957, + "grad_norm": 0.10773763805627823, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6956, + "grad_norm": 0.16481594741344452, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6952, + "grad_norm": 0.16090619564056396, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6909, + "grad_norm": 0.11110244691371918, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6939, + "grad_norm": 0.06961734592914581, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6996, + "grad_norm": 0.13617220520973206, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6973, + "grad_norm": 0.07641923427581787, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6922, + "grad_norm": 0.07990799844264984, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6882, + "grad_norm": 0.5702701210975647, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6946, + "grad_norm": 0.056035786867141724, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6922, + "grad_norm": 0.1349029541015625, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6938, + "grad_norm": 0.13852249085903168, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6909, + "grad_norm": 0.19660893082618713, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6896, + "grad_norm": 0.12231782078742981, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6873, + "grad_norm": 0.23543545603752136, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6879, + "grad_norm": 0.1616705060005188, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6878, + "grad_norm": 0.13497383892536163, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6874, + "grad_norm": 0.13132548332214355, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.703, + "grad_norm": 0.30539363622665405, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.696, + "grad_norm": 0.12872374057769775, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6712, + "grad_norm": 0.40255430340766907, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.676, + "grad_norm": 0.2633000314235687, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6934, + "grad_norm": 0.10677231103181839, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7111, + "grad_norm": 0.46375924348831177, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7068, + "grad_norm": 0.3955704867839813, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.7021, + "grad_norm": 0.32601556181907654, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7169, + "grad_norm": 0.5729773640632629, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7003, + "grad_norm": 0.1996842324733734, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7039, + "grad_norm": 0.3732545077800751, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6762, + "grad_norm": 0.5003936290740967, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.7074, + "grad_norm": 0.5564477443695068, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6967, + "grad_norm": 0.1711839884519577, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6907, + "grad_norm": 0.08440211415290833, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6955, + "grad_norm": 0.3993905186653137, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6966, + "grad_norm": 0.06576453149318695, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.692, + "grad_norm": 0.10545915365219116, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.693, + "grad_norm": 0.06235138326883316, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6964, + "grad_norm": 0.32676106691360474, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6953, + "grad_norm": 0.1759742945432663, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6881, + "grad_norm": 0.3296176791191101, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6979, + "grad_norm": 0.23389941453933716, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6917, + "grad_norm": 0.10424681752920151, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6981, + "grad_norm": 0.15585274994373322, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6949, + "grad_norm": 0.10060406476259232, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6965, + "grad_norm": 0.1002727746963501, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6907, + "grad_norm": 0.09972648322582245, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6911, + "grad_norm": 0.0980520248413086, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6891, + "grad_norm": 0.11619674414396286, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6878, + "grad_norm": 0.12326416373252869, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6882, + "grad_norm": 0.11231421679258347, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.699, + "grad_norm": 0.2565767168998718, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.7049, + "grad_norm": 0.3161673843860626, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.695, + "grad_norm": 0.06501755863428116, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7012, + "grad_norm": 0.3089890480041504, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6871, + "grad_norm": 0.158905491232872, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6999, + "grad_norm": 0.19906027615070343, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6994, + "grad_norm": 0.39247483015060425, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.69, + "grad_norm": 0.2841380834579468, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6972, + "grad_norm": 0.21467481553554535, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6885, + "grad_norm": 0.2494855672121048, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6909, + "grad_norm": 0.10403452813625336, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6912, + "grad_norm": 0.3712635338306427, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6919, + "grad_norm": 0.1404065638780594, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6896, + "grad_norm": 0.41758716106414795, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6939, + "grad_norm": 0.14124171435832977, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6923, + "grad_norm": 0.30861759185791016, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6962, + "grad_norm": 0.42809122800827026, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6953, + "grad_norm": 0.11524618417024612, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6947, + "grad_norm": 0.16743776202201843, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6934, + "grad_norm": 0.05939275771379471, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6918, + "grad_norm": 0.1982499063014984, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6923, + "grad_norm": 0.2604564130306244, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6935, + "grad_norm": 0.06297598034143448, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6961, + "grad_norm": 0.30662938952445984, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6916, + "grad_norm": 0.06957300752401352, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6941, + "grad_norm": 0.07412543147802353, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6908, + "grad_norm": 0.26146188378334045, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6934, + "grad_norm": 0.2540963590145111, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6954, + "grad_norm": 0.11708111315965652, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6951, + "grad_norm": 0.07813259959220886, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6954, + "grad_norm": 0.058054786175489426, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.692, + "grad_norm": 0.1441773623228073, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6931, + "grad_norm": 0.27087849378585815, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6939, + "grad_norm": 0.24680492281913757, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6925, + "grad_norm": 0.23893745243549347, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6918, + "grad_norm": 0.508774995803833, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.689, + "grad_norm": 0.6244984269142151, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6946, + "grad_norm": 0.06572522222995758, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6906, + "grad_norm": 0.26117104291915894, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6938, + "grad_norm": 0.060611240565776825, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6907, + "grad_norm": 0.2042621672153473, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6937, + "grad_norm": 0.06519153714179993, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.689, + "grad_norm": 0.0794588029384613, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6997, + "grad_norm": 0.35396695137023926, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6923, + "grad_norm": 0.09092722833156586, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6943, + "grad_norm": 0.11738506704568863, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6902, + "grad_norm": 0.12261214107275009, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6966, + "grad_norm": 0.3482706546783447, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6874, + "grad_norm": 0.2830795645713806, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436271616, + "loss": 0.7015, + "grad_norm": 0.41531050205230713, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7022, + "grad_norm": 0.41271474957466125, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6945, + "grad_norm": 0.1735668033361435, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6956, + "grad_norm": 0.1271335780620575, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.699, + "grad_norm": 0.13822206854820251, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6933, + "grad_norm": 0.1643162965774536, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6914, + "grad_norm": 0.15025730431079865, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6928, + "grad_norm": 0.2011362463235855, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6932, + "grad_norm": 0.09689207375049591, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6911, + "grad_norm": 0.3833630681037903, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6912, + "grad_norm": 0.07189416885375977, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6942, + "grad_norm": 0.2018006294965744, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6932, + "grad_norm": 0.18802930414676666, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.689, + "grad_norm": 0.07558046281337738, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6967, + "grad_norm": 0.07807182520627975, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6919, + "grad_norm": 0.04885090887546539, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6942, + "grad_norm": 0.23497429490089417, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6931, + "grad_norm": 0.12534525990486145, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6936, + "grad_norm": 0.12179633975028992, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436268544, + "loss": 0.6904, + "grad_norm": 0.16724975407123566, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6938, + "grad_norm": 0.19791297614574432, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6942, + "grad_norm": 0.11529538035392761, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6947, + "grad_norm": 0.16074901819229126, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6898, + "grad_norm": 0.3405320942401886, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6897, + "grad_norm": 0.6681931614875793, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6919, + "grad_norm": 0.09342384338378906, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6967, + "grad_norm": 0.28320300579071045, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6978, + "grad_norm": 0.22785477340221405, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6943, + "grad_norm": 0.1962384432554245, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6939, + "grad_norm": 0.18186016380786896, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6948, + "grad_norm": 0.09391657263040543, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.7015, + "grad_norm": 0.42099735140800476, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6938, + "grad_norm": 0.12365562468767166, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.691, + "grad_norm": 0.14646485447883606, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6918, + "grad_norm": 0.06423018127679825, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.7001, + "grad_norm": 0.4691506028175354, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6981, + "grad_norm": 0.29412633180618286, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6935, + "grad_norm": 0.2020447999238968, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6977, + "grad_norm": 0.3363268971443176, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6933, + "grad_norm": 0.09102755784988403, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6942, + "grad_norm": 0.11175628751516342, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.688, + "grad_norm": 0.31207120418548584, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6942, + "grad_norm": 0.17199304699897766, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6912, + "grad_norm": 0.2078588306903839, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6925, + "grad_norm": 0.058434322476387024, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6979, + "grad_norm": 0.23535022139549255, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.692, + "grad_norm": 0.07194728404283524, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6942, + "grad_norm": 0.10786271095275879, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6951, + "grad_norm": 0.06550299376249313, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6872, + "grad_norm": 0.3658002018928528, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6923, + "grad_norm": 0.1438230723142624, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.692, + "grad_norm": 0.05862034857273102, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6918, + "grad_norm": 0.07259807735681534, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6895, + "grad_norm": 0.07411931455135345, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6917, + "grad_norm": 0.15078137814998627, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6991, + "grad_norm": 0.38842707872390747, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436267008, + "loss": 0.6887, + "grad_norm": 0.11452246457338333, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6884, + "grad_norm": 0.3136930465698242, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6925, + "grad_norm": 0.2384672909975052, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.698, + "grad_norm": 0.3323866128921509, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6964, + "grad_norm": 0.5113027691841125, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6962, + "grad_norm": 0.2790364921092987, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6928, + "grad_norm": 0.22070778906345367, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6995, + "grad_norm": 0.5077831745147705, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436265472, + "loss": 0.6923, + "grad_norm": 0.06759735941886902, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6909, + "grad_norm": 0.27039316296577454, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6918, + "grad_norm": 0.11788258701562881, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6955, + "grad_norm": 0.2823037803173065, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6924, + "grad_norm": 0.16907955706119537, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6932, + "grad_norm": 0.1581099033355713, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6927, + "grad_norm": 0.10501580685377121, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6932, + "grad_norm": 0.15892255306243896, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436263936, + "loss": 0.6943, + "grad_norm": 0.2661198675632477, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6949, + "grad_norm": 0.1124497577548027, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436253184, + "loss": 0.6939, + "grad_norm": 0.43267822265625, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6929, + "grad_norm": 0.20972499251365662, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6914, + "grad_norm": 0.16194036602973938, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6936, + "grad_norm": 0.22002218663692474, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6945, + "grad_norm": 0.3706638813018799, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6978, + "grad_norm": 0.10653430223464966, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6896, + "grad_norm": 0.2148282378911972, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6939, + "grad_norm": 0.17692965269088745, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6936, + "grad_norm": 0.07955671101808548, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6906, + "grad_norm": 0.21184535324573517, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6919, + "grad_norm": 0.22281238436698914, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6913, + "grad_norm": 0.06758581101894379, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6946, + "grad_norm": 0.20585639774799347, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6906, + "grad_norm": 0.06721778959035873, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6949, + "grad_norm": 0.061478689312934875, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6906, + "grad_norm": 0.26499196887016296, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6974, + "grad_norm": 0.3785613179206848, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6967, + "grad_norm": 0.14968296885490417, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.691, + "grad_norm": 0.2802983820438385, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6936, + "grad_norm": 0.14907334744930267, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6916, + "grad_norm": 0.11081935465335846, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6933, + "grad_norm": 0.054633744060993195, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6921, + "grad_norm": 0.14787234365940094, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6918, + "grad_norm": 0.15585161745548248, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436256256, + "loss": 0.6908, + "grad_norm": 0.2217024713754654, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.43625472, + "loss": 0.6924, + "grad_norm": 0.26207882165908813, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436268544, + "loss": 0.6916, + "grad_norm": 0.2184615582227707, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6891, + "grad_norm": 0.1060728132724762, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.69, + "grad_norm": 0.06253160536289215, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436259328, + "loss": 0.6945, + "grad_norm": 0.25265344977378845, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6926, + "grad_norm": 0.0657254010438919, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436251648, + "loss": 0.6923, + "grad_norm": 0.0851597711443901, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.693360128, + "gpu_mem": 4.4362624, + "loss": 0.6968, + "grad_norm": 0.4301515221595764, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6948, + "grad_norm": 0.22010056674480438, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436260864, + "loss": 0.6939, + "grad_norm": 0.21923400461673737, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "loss": 0.6909, + "grad_norm": 0.18590153753757477, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.693360128, + "gpu_mem": 4.436257792, + "train_runtime": 1426.3997, + "train_samples_per_second": 28.699, + "train_steps_per_second": 0.449, + "total_flos": 1.4579617164754944e+16, + "train_loss": 0.747900562826544 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..03eabaea80bc9f8c1936ead28264f565a8ac69c0 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3da8d39588a2788ee3452b7106acf4d14c35a9 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5256511444356748 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f4c046b00f63cc74ecb6e1e081a26b9f769f3de1 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-winogrande-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2", + "seed": 42, + "timestamp": "2025-08-30T08:11:43.395924" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..5ec7bff2eaad6a2c22f0d6b43561b0f35d1b5281 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r32-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.72017664, + "gpu_mem": 4.518260224, + "loss": 3.3802, + "grad_norm": 57.27546310424805, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.726271488, + "gpu_mem": 4.720109056, + "loss": 3.3361, + "grad_norm": 56.232215881347656, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.726664704, + "gpu_mem": 4.720113664, + "loss": 2.9777, + "grad_norm": 52.48434829711914, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.72705792, + "gpu_mem": 4.720112128, + "loss": 2.4558, + "grad_norm": 50.19524383544922, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.727451136, + "gpu_mem": 4.720112128, + "loss": 1.8711, + "grad_norm": 37.914283752441406, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.727647744, + "gpu_mem": 4.720118272, + "loss": 1.3542, + "grad_norm": 21.581628799438477, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.72804096, + "gpu_mem": 4.720124416, + "loss": 1.0431, + "grad_norm": 18.137744903564453, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.728237568, + "gpu_mem": 4.72010752, + "loss": 0.8649, + "grad_norm": 8.423918724060059, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.728237568, + "gpu_mem": 4.720113664, + "loss": 0.8502, + "grad_norm": 11.23043441772461, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.728434176, + "gpu_mem": 4.720116736, + "loss": 0.7983, + "grad_norm": 12.746944427490234, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.728630784, + "gpu_mem": 4.720105984, + "loss": 0.7883, + "grad_norm": 13.818852424621582, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.728827392, + "gpu_mem": 4.720110592, + "loss": 0.6788, + "grad_norm": 4.551372051239014, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.728827392, + "gpu_mem": 4.720118272, + "loss": 0.7462, + "grad_norm": 5.239316940307617, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.729024, + "gpu_mem": 4.720113664, + "loss": 0.7202, + "grad_norm": 4.053483486175537, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.729220608, + "gpu_mem": 4.720113664, + "loss": 0.7049, + "grad_norm": 3.333466053009033, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.729220608, + "gpu_mem": 4.720110592, + "loss": 0.6881, + "grad_norm": 1.3810962438583374, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.729417216, + "gpu_mem": 4.720110592, + "loss": 0.7072, + "grad_norm": 3.288944959640503, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.729613824, + "gpu_mem": 4.720113664, + "loss": 0.7037, + "grad_norm": 2.7249066829681396, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.729613824, + "gpu_mem": 4.720110592, + "loss": 0.7028, + "grad_norm": 2.030271053314209, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.729810432, + "gpu_mem": 4.720118272, + "loss": 0.7136, + "grad_norm": 2.0888376235961914, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.729810432, + "gpu_mem": 4.720110592, + "loss": 0.7478, + "grad_norm": 5.780369281768799, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720110592, + "loss": 0.7349, + "grad_norm": 3.2512173652648926, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720105984, + "loss": 0.7232, + "grad_norm": 3.6738674640655518, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720109056, + "loss": 0.7112, + "grad_norm": 1.9177542924880981, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720112128, + "loss": 0.7092, + "grad_norm": 1.225071668624878, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.73000704, + "gpu_mem": 4.72010752, + "loss": 0.7664, + "grad_norm": 4.354071140289307, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720105984, + "loss": 0.68, + "grad_norm": 0.8261433839797974, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720112128, + "loss": 0.7339, + "grad_norm": 3.09401273727417, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720110592, + "loss": 0.7368, + "grad_norm": 3.0542638301849365, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720110592, + "loss": 0.8114, + "grad_norm": 5.319136619567871, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.73000704, + "gpu_mem": 4.720110592, + "loss": 0.6984, + "grad_norm": 1.7788218259811401, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.730203648, + "gpu_mem": 4.72010752, + "loss": 0.6682, + "grad_norm": 0.5583691000938416, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.730400256, + "gpu_mem": 4.72010752, + "loss": 0.8266, + "grad_norm": 5.361758708953857, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.730400256, + "gpu_mem": 4.72010752, + "loss": 0.8039, + "grad_norm": 4.954075813293457, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720113664, + "loss": 0.7127, + "grad_norm": 2.033935070037842, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720109056, + "loss": 0.7879, + "grad_norm": 6.548220157623291, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.730400256, + "gpu_mem": 4.72010752, + "loss": 0.7084, + "grad_norm": 2.4359781742095947, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720112128, + "loss": 0.704, + "grad_norm": 0.7094013094902039, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720118272, + "loss": 0.6798, + "grad_norm": 0.568288266658783, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.730400256, + "gpu_mem": 4.7201152, + "loss": 0.7547, + "grad_norm": 2.3537309169769287, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.730400256, + "gpu_mem": 4.7201152, + "loss": 0.7445, + "grad_norm": 2.3325884342193604, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720112128, + "loss": 0.7148, + "grad_norm": 4.679717540740967, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720112128, + "loss": 0.7455, + "grad_norm": 2.5314502716064453, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720112128, + "loss": 5.3558, + "grad_norm": 723.1975708007812, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720119808, + "loss": 0.7094, + "grad_norm": 1.70792555809021, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720112128, + "loss": 0.706, + "grad_norm": 0.50592440366745, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.730400256, + "gpu_mem": 4.720113664, + "loss": 0.7056, + "grad_norm": 0.8811830282211304, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6814, + "grad_norm": 0.3716464936733246, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.8726, + "grad_norm": 5.321829319000244, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7375, + "grad_norm": 2.159797191619873, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6961, + "grad_norm": 1.1379742622375488, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7835, + "grad_norm": 2.6022896766662598, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.8243, + "grad_norm": 3.0817880630493164, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720101376, + "loss": 0.6948, + "grad_norm": 0.4320283830165863, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7087, + "grad_norm": 0.7984228730201721, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.702, + "grad_norm": 0.9343792200088501, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.711, + "grad_norm": 1.9531621932983398, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7088, + "grad_norm": 1.276352882385254, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6986, + "grad_norm": 0.5703633427619934, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7089, + "grad_norm": 1.3623318672180176, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7416, + "grad_norm": 2.7381982803344727, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7167, + "grad_norm": 1.7562028169631958, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6866, + "grad_norm": 0.9393060803413391, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.7744, + "grad_norm": 3.5247128009796143, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6815, + "grad_norm": 0.3044804334640503, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6992, + "grad_norm": 0.7948664426803589, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7576, + "grad_norm": 2.5693650245666504, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7123, + "grad_norm": 1.2326534986495972, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.691, + "grad_norm": 0.14709803462028503, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.739, + "grad_norm": 2.0310168266296387, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7111, + "grad_norm": 1.0119962692260742, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6936, + "grad_norm": 0.13146303594112396, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.749, + "grad_norm": 1.6950381994247437, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7009, + "grad_norm": 0.6165192127227783, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6832, + "grad_norm": 0.6725459098815918, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.9004, + "grad_norm": 3.668682098388672, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.792, + "grad_norm": 2.383697986602783, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7127, + "grad_norm": 0.9964679479598999, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6979, + "grad_norm": 0.3265517055988312, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7144, + "grad_norm": 0.8354005813598633, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.7025, + "grad_norm": 0.24958232045173645, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7292, + "grad_norm": 1.5616759061813354, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6886, + "grad_norm": 0.6402855515480042, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7011, + "grad_norm": 1.1026426553726196, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7033, + "grad_norm": 0.7877585887908936, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7219, + "grad_norm": 1.470060110092163, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6884, + "grad_norm": 0.12938757240772247, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7629, + "grad_norm": 2.829951047897339, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7022, + "grad_norm": 1.095219612121582, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6912, + "grad_norm": 0.5342074036598206, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6651, + "grad_norm": 0.6004640460014343, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.8596, + "grad_norm": 3.5393176078796387, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7405, + "grad_norm": 1.4825395345687866, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7086, + "grad_norm": 0.9479426145553589, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7013, + "grad_norm": 0.958656370639801, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7835, + "grad_norm": 2.093811273574829, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7316, + "grad_norm": 1.2628545761108398, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7, + "grad_norm": 0.4505934715270996, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6728, + "grad_norm": 0.4340468645095825, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.775, + "grad_norm": 2.0685911178588867, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7578, + "grad_norm": 1.8091659545898438, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.7557, + "grad_norm": 1.56316077709198, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7086, + "grad_norm": 0.5910893678665161, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6992, + "grad_norm": 0.31733426451683044, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7044, + "grad_norm": 0.8655760288238525, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7021, + "grad_norm": 0.8115829825401306, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.695, + "grad_norm": 0.4802318811416626, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7258, + "grad_norm": 1.6139017343521118, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6974, + "grad_norm": 0.42842987179756165, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7244, + "grad_norm": 1.5731083154678345, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6936, + "grad_norm": 0.13742977380752563, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6943, + "grad_norm": 0.25442665815353394, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720102912, + "loss": 0.6809, + "grad_norm": 0.498094379901886, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6937, + "grad_norm": 0.651147723197937, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7041, + "grad_norm": 0.8938285708427429, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6977, + "grad_norm": 1.5705320835113525, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7697, + "grad_norm": 1.9871735572814941, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7542, + "grad_norm": 1.6701043844223022, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7644, + "grad_norm": 1.8497964143753052, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6945, + "grad_norm": 0.12367319315671921, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7128, + "grad_norm": 0.8832190632820129, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.719, + "grad_norm": 1.3923529386520386, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7056, + "grad_norm": 0.6322979927062988, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7275, + "grad_norm": 1.448596477508545, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720102912, + "loss": 0.6947, + "grad_norm": 0.4014534056186676, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6928, + "grad_norm": 0.28655949234962463, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7143, + "grad_norm": 1.4493529796600342, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6914, + "grad_norm": 0.4185909628868103, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.732, + "grad_norm": 2.080479145050049, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.697, + "grad_norm": 0.17926323413848877, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6918, + "grad_norm": 0.14979717135429382, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7177, + "grad_norm": 1.2299444675445557, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6958, + "grad_norm": 1.3782416582107544, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.723, + "grad_norm": 1.3422859907150269, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6716, + "grad_norm": 0.29098889231681824, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6794, + "grad_norm": 0.3769038915634155, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6899, + "grad_norm": 0.413480281829834, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6956, + "grad_norm": 0.38079705834388733, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6928, + "grad_norm": 0.13262267410755157, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7155, + "grad_norm": 1.1844984292984009, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7115, + "grad_norm": 1.1153477430343628, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6914, + "grad_norm": 0.3798380196094513, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6992, + "grad_norm": 0.5220383405685425, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7154, + "grad_norm": 0.8076120615005493, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6825, + "grad_norm": 0.0844593495130539, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.684, + "grad_norm": 0.25338801741600037, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7041, + "grad_norm": 0.6661930084228516, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6941, + "grad_norm": 0.502446711063385, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7072, + "grad_norm": 1.1523233652114868, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6913, + "grad_norm": 0.49297115206718445, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7351, + "grad_norm": 1.1330760717391968, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7101, + "grad_norm": 0.6332705020904541, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6901, + "grad_norm": 0.07133732736110687, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6938, + "grad_norm": 0.23413392901420593, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6905, + "grad_norm": 0.44009873270988464, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.712, + "grad_norm": 0.8654837608337402, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6458, + "grad_norm": 1.4672516584396362, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720121344, + "loss": 0.7196, + "grad_norm": 1.0956414937973022, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6913, + "grad_norm": 0.3045135736465454, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6976, + "grad_norm": 0.5009533762931824, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7024, + "grad_norm": 0.3072945773601532, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7014, + "grad_norm": 0.4081372916698456, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6791, + "grad_norm": 0.510256290435791, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6808, + "grad_norm": 0.2816726267337799, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6853, + "grad_norm": 0.2034560739994049, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7217, + "grad_norm": 1.1940079927444458, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6975, + "grad_norm": 0.2976217269897461, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7043, + "grad_norm": 0.6216174960136414, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7002, + "grad_norm": 0.4807371199131012, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6973, + "grad_norm": 0.5672400593757629, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.693, + "grad_norm": 0.7708451151847839, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7143, + "grad_norm": 1.0128346681594849, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720102912, + "loss": 0.7012, + "grad_norm": 0.5843798518180847, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6954, + "grad_norm": 0.304953932762146, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7215, + "grad_norm": 1.030972957611084, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6928, + "grad_norm": 0.16266894340515137, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6949, + "grad_norm": 0.24048681557178497, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6909, + "grad_norm": 0.07768089324235916, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6923, + "grad_norm": 0.0869787260890007, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6951, + "grad_norm": 0.2728469669818878, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720125952, + "loss": 0.6729, + "grad_norm": 0.5322023034095764, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7324, + "grad_norm": 1.2435344457626343, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7105, + "grad_norm": 0.878863513469696, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6849, + "grad_norm": 0.36960190534591675, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6966, + "grad_norm": 0.5376189947128296, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7642, + "grad_norm": 1.6297880411148071, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6815, + "grad_norm": 0.3741550147533417, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6999, + "grad_norm": 0.5745854377746582, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7, + "grad_norm": 0.6442886590957642, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.711, + "grad_norm": 1.3806525468826294, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7192, + "grad_norm": 1.5578025579452515, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6784, + "grad_norm": 0.7056897878646851, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.693, + "grad_norm": 0.20332473516464233, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6898, + "grad_norm": 0.04674564301967621, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.716, + "grad_norm": 1.7689440250396729, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6923, + "grad_norm": 0.2124248743057251, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6991, + "grad_norm": 0.538480281829834, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6981, + "grad_norm": 0.6222289204597473, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6813, + "grad_norm": 0.03448052331805229, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7111, + "grad_norm": 0.9422595500946045, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6962, + "grad_norm": 0.23377637565135956, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7148, + "grad_norm": 1.0486937761306763, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6744, + "grad_norm": 0.6723323464393616, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6976, + "grad_norm": 0.3094039261341095, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6996, + "grad_norm": 0.3465484082698822, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7089, + "grad_norm": 0.48172709345817566, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6933, + "grad_norm": 0.08893127739429474, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6872, + "grad_norm": 0.36892908811569214, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6985, + "grad_norm": 0.40097805857658386, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6936, + "grad_norm": 0.22263135015964508, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7127, + "grad_norm": 0.7903273701667786, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6975, + "grad_norm": 0.2247646152973175, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6926, + "grad_norm": 0.11149632930755615, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6941, + "grad_norm": 0.15491022169589996, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.697, + "grad_norm": 0.389967143535614, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6995, + "grad_norm": 0.5424106121063232, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6959, + "grad_norm": 0.2042379230260849, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7017, + "grad_norm": 0.5822784900665283, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6926, + "grad_norm": 0.10752076655626297, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6945, + "grad_norm": 0.03495921939611435, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6948, + "grad_norm": 0.5388035774230957, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6952, + "grad_norm": 0.13899804651737213, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6903, + "grad_norm": 0.1994374841451645, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.694, + "grad_norm": 0.10392753034830093, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7076, + "grad_norm": 0.44438982009887695, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.701, + "grad_norm": 0.31383299827575684, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6941, + "grad_norm": 0.07643990963697433, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6939, + "grad_norm": 0.09563430398702621, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6961, + "grad_norm": 0.15871290862560272, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6984, + "grad_norm": 0.25720492005348206, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6874, + "grad_norm": 0.23664171993732452, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6922, + "grad_norm": 0.0383404865860939, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6958, + "grad_norm": 0.15751682221889496, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720121344, + "loss": 0.6938, + "grad_norm": 0.09246419370174408, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7008, + "grad_norm": 0.32723090052604675, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6883, + "grad_norm": 0.39663559198379517, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6989, + "grad_norm": 0.38401105999946594, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.694, + "grad_norm": 0.6511464715003967, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7037, + "grad_norm": 0.3825187087059021, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7008, + "grad_norm": 0.2704549729824066, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6732, + "grad_norm": 0.20272208750247955, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6819, + "grad_norm": 0.03015664592385292, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.7456, + "grad_norm": 0.8899064660072327, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.712, + "grad_norm": 0.4248214066028595, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.701, + "grad_norm": 0.3430611491203308, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6952, + "grad_norm": 0.23715868592262268, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6919, + "grad_norm": 0.051284439861774445, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.714, + "grad_norm": 0.9449547529220581, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6864, + "grad_norm": 0.12578842043876648, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7101, + "grad_norm": 1.1539607048034668, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.694, + "grad_norm": 0.5180936455726624, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7024, + "grad_norm": 1.2567081451416016, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6966, + "grad_norm": 0.3418976366519928, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6943, + "grad_norm": 0.10212824493646622, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6901, + "grad_norm": 0.25213363766670227, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7036, + "grad_norm": 0.7465999722480774, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.695, + "grad_norm": 0.25468191504478455, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7034, + "grad_norm": 0.9659193158149719, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7001, + "grad_norm": 1.3087700605392456, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6975, + "grad_norm": 0.8052722215652466, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6957, + "grad_norm": 1.4646708965301514, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6943, + "grad_norm": 0.12530392408370972, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720121344, + "loss": 0.6938, + "grad_norm": 0.21724450588226318, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6922, + "grad_norm": 0.05238856375217438, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6969, + "grad_norm": 0.5901528000831604, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6939, + "grad_norm": 0.5630112290382385, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6938, + "grad_norm": 0.5498056411743164, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6994, + "grad_norm": 1.1121935844421387, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6934, + "grad_norm": 0.7021274566650391, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6961, + "grad_norm": 0.8253675103187561, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.694, + "grad_norm": 0.09485553205013275, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7023, + "grad_norm": 1.165671706199646, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6935, + "grad_norm": 0.4954543709754944, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.692, + "grad_norm": 0.8397842049598694, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7019, + "grad_norm": 1.10673987865448, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.701, + "grad_norm": 0.840828001499176, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6948, + "grad_norm": 0.8557524085044861, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6935, + "grad_norm": 0.14333386719226837, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6929, + "grad_norm": 0.7223246097564697, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6922, + "grad_norm": 0.0306211207062006, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6842, + "grad_norm": 0.23945514857769012, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7089, + "grad_norm": 0.6352826356887817, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7408, + "grad_norm": 1.369584560394287, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.704, + "grad_norm": 0.5727683901786804, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6938, + "grad_norm": 0.1859133392572403, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7157, + "grad_norm": 1.696636438369751, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.7043, + "grad_norm": 0.7674811482429504, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.699, + "grad_norm": 0.5101733803749084, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6996, + "grad_norm": 0.7179736495018005, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6917, + "grad_norm": 0.284076452255249, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6937, + "grad_norm": 0.21216025948524475, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720101376, + "loss": 0.6978, + "grad_norm": 0.3499354422092438, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7265, + "grad_norm": 1.1123017072677612, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720102912, + "loss": 0.7289, + "grad_norm": 1.2034318447113037, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6952, + "grad_norm": 0.13924765586853027, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6971, + "grad_norm": 0.5038891434669495, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6957, + "grad_norm": 0.14182189106941223, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6971, + "grad_norm": 0.294199138879776, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6913, + "grad_norm": 0.1657639592885971, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7059, + "grad_norm": 1.0723072290420532, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6953, + "grad_norm": 0.34668996930122375, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6988, + "grad_norm": 0.5788814425468445, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720121344, + "loss": 0.6923, + "grad_norm": 0.02944917231798172, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6891, + "grad_norm": 0.14991898834705353, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.687, + "grad_norm": 0.23866210877895355, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6925, + "grad_norm": 0.25284481048583984, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7154, + "grad_norm": 1.436574935913086, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6897, + "grad_norm": 0.16865338385105133, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6997, + "grad_norm": 1.1011378765106201, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7033, + "grad_norm": 0.9454079270362854, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.7024, + "grad_norm": 0.6924813389778137, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6911, + "grad_norm": 0.041777729988098145, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6982, + "grad_norm": 0.5393715500831604, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6936, + "grad_norm": 0.02448694035410881, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6905, + "grad_norm": 0.25952762365341187, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7087, + "grad_norm": 1.0640366077423096, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7042, + "grad_norm": 0.8085795044898987, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6959, + "grad_norm": 0.40610137581825256, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6882, + "grad_norm": 0.33856216073036194, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6786, + "grad_norm": 0.2530158758163452, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6535, + "grad_norm": 0.3908483386039734, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7031, + "grad_norm": 0.3450213372707367, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7166, + "grad_norm": 0.5918006300926208, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720121344, + "loss": 0.7441, + "grad_norm": 1.1248430013656616, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7071, + "grad_norm": 0.7133017182350159, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.695, + "grad_norm": 0.24797262251377106, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7113, + "grad_norm": 1.6026979684829712, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7045, + "grad_norm": 1.144885778427124, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.688, + "grad_norm": 0.47744861245155334, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7076, + "grad_norm": 1.2831629514694214, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7008, + "grad_norm": 0.8616870641708374, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6976, + "grad_norm": 1.3015812635421753, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7036, + "grad_norm": 1.2680689096450806, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6957, + "grad_norm": 0.45022308826446533, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6947, + "grad_norm": 0.30140987038612366, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6906, + "grad_norm": 1.1204452514648438, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7065, + "grad_norm": 1.3225176334381104, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7141, + "grad_norm": 1.7681318521499634, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7011, + "grad_norm": 0.9267274737358093, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.695, + "grad_norm": 0.3274507522583008, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6938, + "grad_norm": 0.7475917339324951, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.696, + "grad_norm": 0.8096014261245728, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6935, + "grad_norm": 0.051297158002853394, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.693, + "grad_norm": 0.8674398064613342, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6895, + "grad_norm": 0.5000708699226379, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6981, + "grad_norm": 0.5877357721328735, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7122, + "grad_norm": 1.4082072973251343, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7113, + "grad_norm": 1.3163734674453735, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72009984, + "loss": 0.6913, + "grad_norm": 0.09009212255477905, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6928, + "grad_norm": 0.07127156108617783, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6943, + "grad_norm": 0.5261074304580688, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6949, + "grad_norm": 0.25749513506889343, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6913, + "grad_norm": 0.04777367413043976, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.659, + "grad_norm": 1.535619854927063, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7124, + "grad_norm": 0.6650431156158447, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7119, + "grad_norm": 0.5613598823547363, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7155, + "grad_norm": 0.6001882553100586, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7049, + "grad_norm": 0.4394175112247467, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6962, + "grad_norm": 0.17465947568416595, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6957, + "grad_norm": 0.33749648928642273, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6946, + "grad_norm": 0.17299968004226685, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.681, + "grad_norm": 0.5107457041740417, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6921, + "grad_norm": 0.178608238697052, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7194, + "grad_norm": 0.9469413161277771, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.731, + "grad_norm": 1.1998013257980347, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7064, + "grad_norm": 0.5949049592018127, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6797, + "grad_norm": 0.3075293004512787, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6851, + "grad_norm": 0.2683301568031311, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6852, + "grad_norm": 0.2900184690952301, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.695, + "grad_norm": 0.17847023904323578, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6972, + "grad_norm": 0.24754248559474945, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6968, + "grad_norm": 0.3157871961593628, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6942, + "grad_norm": 0.07802005112171173, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72012288, + "loss": 0.6928, + "grad_norm": 0.5802419185638428, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6942, + "grad_norm": 0.30882540345191956, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6939, + "grad_norm": 0.3742713928222656, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72012288, + "loss": 0.691, + "grad_norm": 0.23500396311283112, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6891, + "grad_norm": 0.26744574308395386, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7064, + "grad_norm": 0.6954576969146729, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.682, + "grad_norm": 0.38613051176071167, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7039, + "grad_norm": 0.581422746181488, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6828, + "grad_norm": 0.19507355988025665, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6863, + "grad_norm": 0.07699579000473022, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7086, + "grad_norm": 0.7247633337974548, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7149, + "grad_norm": 0.968413233757019, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.697, + "grad_norm": 0.26986491680145264, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6933, + "grad_norm": 0.3396812975406647, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6924, + "grad_norm": 0.107146255671978, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6954, + "grad_norm": 0.19240212440490723, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.689, + "grad_norm": 0.16087864339351654, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6938, + "grad_norm": 0.11122910678386688, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6964, + "grad_norm": 0.21420706808567047, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6901, + "grad_norm": 0.04138645902276039, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6991, + "grad_norm": 0.3014482855796814, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6734, + "grad_norm": 0.642922043800354, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6924, + "grad_norm": 0.06563147902488708, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6916, + "grad_norm": 0.0839574858546257, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6928, + "grad_norm": 0.1047968789935112, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7073, + "grad_norm": 0.5934455990791321, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7026, + "grad_norm": 0.5413299202919006, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6858, + "grad_norm": 0.5939713716506958, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.694, + "grad_norm": 0.034741271287202835, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6931, + "grad_norm": 0.1690404862165451, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6935, + "grad_norm": 0.3437475562095642, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6939, + "grad_norm": 0.3497893214225769, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6948, + "grad_norm": 0.36352765560150146, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6932, + "grad_norm": 0.2033793330192566, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6922, + "grad_norm": 0.1402321606874466, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6914, + "grad_norm": 0.07998552173376083, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6867, + "grad_norm": 0.289734423160553, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.7066, + "grad_norm": 0.5669627785682678, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6885, + "grad_norm": 0.03683910891413689, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6884, + "grad_norm": 0.030282272025942802, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6981, + "grad_norm": 0.23343881964683533, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7001, + "grad_norm": 0.3000665307044983, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.7052, + "grad_norm": 0.5063682794570923, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7009, + "grad_norm": 0.4417142868041992, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6922, + "grad_norm": 0.21695689857006073, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.691, + "grad_norm": 0.49243229627609253, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6951, + "grad_norm": 0.1695171296596527, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7053, + "grad_norm": 0.5976517200469971, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7127, + "grad_norm": 0.8328030109405518, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6769, + "grad_norm": 0.5237409472465515, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6993, + "grad_norm": 0.33801451325416565, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6976, + "grad_norm": 0.2944739758968353, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6859, + "grad_norm": 0.2675777077674866, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6934, + "grad_norm": 0.05418539419770241, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.7033, + "grad_norm": 0.5692530274391174, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6932, + "grad_norm": 0.05446188896894455, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6924, + "grad_norm": 0.12962254881858826, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6949, + "grad_norm": 0.4106745719909668, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.694, + "grad_norm": 0.03089405782520771, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6929, + "grad_norm": 0.05140059441328049, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.694, + "grad_norm": 0.052435457706451416, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6944, + "grad_norm": 0.20689545571804047, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6931, + "grad_norm": 0.0819716528058052, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6946, + "grad_norm": 0.06515394896268845, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6929, + "grad_norm": 0.023208066821098328, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6938, + "grad_norm": 0.028950193896889687, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6914, + "grad_norm": 0.5677440762519836, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6934, + "grad_norm": 0.028887998312711716, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6902, + "grad_norm": 0.1042974665760994, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6961, + "grad_norm": 0.16584116220474243, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6879, + "grad_norm": 0.11453115195035934, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6897, + "grad_norm": 0.04005393758416176, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6835, + "grad_norm": 0.12769098579883575, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6858, + "grad_norm": 0.045596592128276825, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6854, + "grad_norm": 0.026519285514950752, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6854, + "grad_norm": 0.023741962388157845, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7118, + "grad_norm": 0.41572466492652893, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6994, + "grad_norm": 0.2246115654706955, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6645, + "grad_norm": 0.2929467260837555, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6733, + "grad_norm": 0.1665802150964737, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6945, + "grad_norm": 0.15671882033348083, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7167, + "grad_norm": 0.4677496552467346, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.7104, + "grad_norm": 0.37179628014564514, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7034, + "grad_norm": 0.26108232140541077, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7123, + "grad_norm": 0.42205747961997986, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6958, + "grad_norm": 0.10318117588758469, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6974, + "grad_norm": 0.19337885081768036, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6911, + "grad_norm": 0.43432649970054626, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6921, + "grad_norm": 0.30418860912323, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.694, + "grad_norm": 0.03701454773545265, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.697, + "grad_norm": 0.14719635248184204, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6859, + "grad_norm": 0.20314116775989532, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6958, + "grad_norm": 0.09987439960241318, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6933, + "grad_norm": 0.06888532638549805, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6966, + "grad_norm": 0.13110306859016418, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7097, + "grad_norm": 0.395150363445282, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7017, + "grad_norm": 0.24518249928951263, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6808, + "grad_norm": 0.25806140899658203, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.7017, + "grad_norm": 0.2903796434402466, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6954, + "grad_norm": 0.1185152679681778, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.696, + "grad_norm": 0.16914165019989014, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6945, + "grad_norm": 0.08419326692819595, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.694, + "grad_norm": 0.06854535639286041, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6923, + "grad_norm": 0.18548718094825745, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6928, + "grad_norm": 0.20728768408298492, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.693, + "grad_norm": 0.22493883967399597, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.693, + "grad_norm": 0.24496106803417206, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6924, + "grad_norm": 0.2143675535917282, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6952, + "grad_norm": 0.25425463914871216, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6962, + "grad_norm": 0.33462396264076233, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6936, + "grad_norm": 0.04659596085548401, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6958, + "grad_norm": 0.3250470757484436, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6926, + "grad_norm": 0.2885156571865082, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6945, + "grad_norm": 0.15931758284568787, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6933, + "grad_norm": 0.45983657240867615, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6955, + "grad_norm": 0.5084583759307861, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6925, + "grad_norm": 0.21837398409843445, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6956, + "grad_norm": 0.45115146040916443, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6938, + "grad_norm": 0.20288866758346558, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6955, + "grad_norm": 0.5776761770248413, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6929, + "grad_norm": 0.15132448077201843, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6905, + "grad_norm": 0.5593264102935791, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.692, + "grad_norm": 0.16078925132751465, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6994, + "grad_norm": 0.39185914397239685, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7036, + "grad_norm": 0.5111779570579529, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.695, + "grad_norm": 0.14450673758983612, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6971, + "grad_norm": 0.20095449686050415, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6933, + "grad_norm": 0.0265535656362772, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6899, + "grad_norm": 0.18054957687854767, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6888, + "grad_norm": 0.24400466680526733, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6932, + "grad_norm": 0.023109152913093567, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.699, + "grad_norm": 0.35746636986732483, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6938, + "grad_norm": 0.06470338255167007, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6926, + "grad_norm": 0.0350029431283474, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.689, + "grad_norm": 0.252542108297348, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.696, + "grad_norm": 0.28093603253364563, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6953, + "grad_norm": 0.10730616003274918, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.693, + "grad_norm": 0.03683946654200554, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6932, + "grad_norm": 0.043334271758794785, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6938, + "grad_norm": 0.14542052149772644, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6923, + "grad_norm": 0.2924746870994568, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6942, + "grad_norm": 0.2601511478424072, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6934, + "grad_norm": 0.25027918815612793, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.693, + "grad_norm": 0.5645226836204529, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6926, + "grad_norm": 0.6825657486915588, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.693, + "grad_norm": 0.02775096520781517, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6919, + "grad_norm": 0.2696119546890259, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6931, + "grad_norm": 0.044292110949754715, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6897, + "grad_norm": 0.175717294216156, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6945, + "grad_norm": 0.0643916204571724, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6933, + "grad_norm": 0.0709623321890831, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.7023, + "grad_norm": 0.33690887689590454, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6936, + "grad_norm": 0.08203183859586716, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6903, + "grad_norm": 0.07813210785388947, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6894, + "grad_norm": 0.08311379700899124, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.7018, + "grad_norm": 0.3284844756126404, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6851, + "grad_norm": 0.22443024814128876, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720124416, + "loss": 0.7032, + "grad_norm": 0.37376078963279724, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7027, + "grad_norm": 0.37163224816322327, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6961, + "grad_norm": 0.16151966154575348, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6949, + "grad_norm": 0.10799293220043182, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6954, + "grad_norm": 0.09846331179141998, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6947, + "grad_norm": 0.1384023278951645, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6919, + "grad_norm": 0.14585651457309723, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6938, + "grad_norm": 0.17164185643196106, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6941, + "grad_norm": 0.06316741555929184, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6944, + "grad_norm": 0.3969831168651581, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6927, + "grad_norm": 0.029931509867310524, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6927, + "grad_norm": 0.15130233764648438, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.692, + "grad_norm": 0.14708702266216278, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6918, + "grad_norm": 0.0439065657556057, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6946, + "grad_norm": 0.09425830096006393, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6932, + "grad_norm": 0.04089101031422615, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.691, + "grad_norm": 0.1870659589767456, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6919, + "grad_norm": 0.07580937445163727, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6924, + "grad_norm": 0.07271397858858109, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720121344, + "loss": 0.6909, + "grad_norm": 0.12287227809429169, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6963, + "grad_norm": 0.23733270168304443, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6924, + "grad_norm": 0.056769512593746185, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6961, + "grad_norm": 0.18614043295383453, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.687, + "grad_norm": 0.28821074962615967, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6803, + "grad_norm": 0.6335099935531616, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6914, + "grad_norm": 0.04852161929011345, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.7001, + "grad_norm": 0.3308299481868744, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6994, + "grad_norm": 0.27911677956581116, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6888, + "grad_norm": 0.1531069129705429, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6968, + "grad_norm": 0.21970370411872864, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6912, + "grad_norm": 0.03667301684617996, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.705, + "grad_norm": 0.47239577770233154, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.696, + "grad_norm": 0.15824586153030396, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6912, + "grad_norm": 0.09314225614070892, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.693, + "grad_norm": 0.04287112504243851, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.7037, + "grad_norm": 0.5010049939155579, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6992, + "grad_norm": 0.30880317091941833, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6899, + "grad_norm": 0.16553786396980286, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6994, + "grad_norm": 0.3421373963356018, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6914, + "grad_norm": 0.06323792040348053, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6943, + "grad_norm": 0.10484077036380768, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6891, + "grad_norm": 0.2918790578842163, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6945, + "grad_norm": 0.1520087569952011, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6908, + "grad_norm": 0.18315336108207703, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6932, + "grad_norm": 0.03694739565253258, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6952, + "grad_norm": 0.19211795926094055, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6929, + "grad_norm": 0.04055141285061836, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6924, + "grad_norm": 0.0871226042509079, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.693, + "grad_norm": 0.03763163089752197, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.691, + "grad_norm": 0.3463771641254425, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.692, + "grad_norm": 0.13913355767726898, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6928, + "grad_norm": 0.03486444801092148, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6941, + "grad_norm": 0.04248690605163574, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6933, + "grad_norm": 0.03354766592383385, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6921, + "grad_norm": 0.13390979170799255, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6958, + "grad_norm": 0.35036563873291016, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720119808, + "loss": 0.6913, + "grad_norm": 0.08112151175737381, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6901, + "grad_norm": 0.2867778241634369, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6947, + "grad_norm": 0.19014088809490204, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6957, + "grad_norm": 0.2982032597064972, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6974, + "grad_norm": 0.45201632380485535, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6956, + "grad_norm": 0.23695386946201324, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6948, + "grad_norm": 0.1825084090232849, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6958, + "grad_norm": 0.4468189477920532, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720118272, + "loss": 0.6922, + "grad_norm": 0.04162812605500221, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6916, + "grad_norm": 0.2421417534351349, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6932, + "grad_norm": 0.09272465854883194, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6935, + "grad_norm": 0.2547963559627533, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.694, + "grad_norm": 0.11723332107067108, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6935, + "grad_norm": 0.14607861638069153, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6932, + "grad_norm": 0.06697121262550354, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6933, + "grad_norm": 0.11258146166801453, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720116736, + "loss": 0.6944, + "grad_norm": 0.21183128654956818, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6927, + "grad_norm": 0.09649782627820969, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720105984, + "loss": 0.6931, + "grad_norm": 0.3651270270347595, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6931, + "grad_norm": 0.1630934625864029, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6927, + "grad_norm": 0.1474672257900238, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6934, + "grad_norm": 0.2054101675748825, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6936, + "grad_norm": 0.30717524886131287, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6937, + "grad_norm": 0.05763290077447891, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6934, + "grad_norm": 0.20102497935295105, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6937, + "grad_norm": 0.15657541155815125, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6938, + "grad_norm": 0.06139551103115082, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6927, + "grad_norm": 0.1988128423690796, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6935, + "grad_norm": 0.20542262494564056, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6928, + "grad_norm": 0.02514488995075226, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6926, + "grad_norm": 0.1551140695810318, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6928, + "grad_norm": 0.028070511296391487, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6935, + "grad_norm": 0.05486086383461952, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6921, + "grad_norm": 0.24807782471179962, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6931, + "grad_norm": 0.30466580390930176, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6938, + "grad_norm": 0.10400989651679993, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6934, + "grad_norm": 0.25413626432418823, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6938, + "grad_norm": 0.10774695128202438, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.693, + "grad_norm": 0.09930668771266937, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6928, + "grad_norm": 0.028209511190652847, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6928, + "grad_norm": 0.11059189587831497, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.693, + "grad_norm": 0.10915698111057281, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720109056, + "loss": 0.6934, + "grad_norm": 0.2059352993965149, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.72010752, + "loss": 0.6929, + "grad_norm": 0.20825274288654327, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720121344, + "loss": 0.6934, + "grad_norm": 0.1575714498758316, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6924, + "grad_norm": 0.09849834442138672, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.692, + "grad_norm": 0.026948142796754837, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720112128, + "loss": 0.6926, + "grad_norm": 0.20780989527702332, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6934, + "grad_norm": 0.05437394604086876, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720104448, + "loss": 0.6936, + "grad_norm": 0.039754580706357956, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.730596864, + "gpu_mem": 4.7201152, + "loss": 0.6938, + "grad_norm": 0.35335856676101685, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6926, + "grad_norm": 0.19915981590747833, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720113664, + "loss": 0.6928, + "grad_norm": 0.20088230073451996, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "loss": 0.6926, + "grad_norm": 0.16665127873420715, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.730596864, + "gpu_mem": 4.720110592, + "train_runtime": 1433.4662, + "train_samples_per_second": 28.557, + "train_steps_per_second": 0.446, + "total_flos": 1.4912476582969344e+16, + "train_loss": 0.7271688040345907 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/README.md b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/adapter_config.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..616e0cc3677d4646846654f1887fbef4d57d10ca --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/eval_results.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..591df5a8840c5f4217107c968491d4ca952dc5ed --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5295974743488555 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/training_configuration.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..6758fc33521a1d00dab9257f813a345b7384a0dc --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "lora", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-lora-winogrande-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2", + "seed": 42, + "timestamp": "2025-08-30T01:03:49.105158" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/training_logs.json b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..7c9da487dd4772e9ffe4fdafcbef6675bb9aa1bd --- /dev/null +++ b/TinyLlama_v1.1-lora/TinyLlama_v1.1-lora-winogrande-r8-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 1.686470656, + "gpu_mem": 4.442566144, + "loss": 3.3802, + "grad_norm": 28.665983200073242, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 1.692368896, + "gpu_mem": 4.493026816, + "loss": 3.3361, + "grad_norm": 28.10526466369629, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 1.69295872, + "gpu_mem": 4.493031424, + "loss": 3.1715, + "grad_norm": 26.71535873413086, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 1.693351936, + "gpu_mem": 4.493029888, + "loss": 3.0164, + "grad_norm": 27.324888229370117, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 1.693745152, + "gpu_mem": 4.493029888, + "loss": 2.8504, + "grad_norm": 26.73137664794922, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 1.69394176, + "gpu_mem": 4.493036032, + "loss": 2.5881, + "grad_norm": 26.747364044189453, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 1.694138368, + "gpu_mem": 4.493042176, + "loss": 2.2082, + "grad_norm": 24.36501121520996, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 1.694531584, + "gpu_mem": 4.49302528, + "loss": 1.8804, + "grad_norm": 19.74995994567871, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 1.694531584, + "gpu_mem": 4.493031424, + "loss": 1.7271, + "grad_norm": 16.149599075317383, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 1.694728192, + "gpu_mem": 4.493034496, + "loss": 1.2976, + "grad_norm": 12.080035209655762, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 1.6949248, + "gpu_mem": 4.493023744, + "loss": 1.0683, + "grad_norm": 9.163285255432129, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 1.695121408, + "gpu_mem": 4.493028352, + "loss": 1.0503, + "grad_norm": 8.875922203063965, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 1.695318016, + "gpu_mem": 4.493036032, + "loss": 0.8807, + "grad_norm": 4.396264553070068, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 1.695318016, + "gpu_mem": 4.493031424, + "loss": 0.7895, + "grad_norm": 3.330125570297241, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 1.695318016, + "gpu_mem": 4.493031424, + "loss": 0.7954, + "grad_norm": 6.737238883972168, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 1.695514624, + "gpu_mem": 4.493028352, + "loss": 0.7323, + "grad_norm": 6.610389709472656, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 1.695514624, + "gpu_mem": 4.493028352, + "loss": 0.7468, + "grad_norm": 6.945790767669678, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 1.695711232, + "gpu_mem": 4.493031424, + "loss": 0.6949, + "grad_norm": 1.157287836074829, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 1.695711232, + "gpu_mem": 4.493028352, + "loss": 0.7309, + "grad_norm": 3.1316652297973633, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 1.69590784, + "gpu_mem": 4.493036032, + "loss": 0.7755, + "grad_norm": 5.258666038513184, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 1.69590784, + "gpu_mem": 4.493028352, + "loss": 0.715, + "grad_norm": 2.5349018573760986, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 1.69590784, + "gpu_mem": 4.493028352, + "loss": 0.726, + "grad_norm": 3.523320198059082, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 1.696104448, + "gpu_mem": 4.493023744, + "loss": 0.6869, + "grad_norm": 2.5169506072998047, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 1.696104448, + "gpu_mem": 4.493026816, + "loss": 0.7, + "grad_norm": 1.0256839990615845, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 1.696104448, + "gpu_mem": 4.493029888, + "loss": 0.717, + "grad_norm": 1.7955429553985596, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 1.696301056, + "gpu_mem": 4.49302528, + "loss": 0.7096, + "grad_norm": 2.880420446395874, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 1.696301056, + "gpu_mem": 4.493023744, + "loss": 0.6948, + "grad_norm": 2.6128876209259033, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 1.696301056, + "gpu_mem": 4.493029888, + "loss": 0.7163, + "grad_norm": 2.238095760345459, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 1.696497664, + "gpu_mem": 4.493028352, + "loss": 0.6978, + "grad_norm": 0.49516555666923523, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 1.696497664, + "gpu_mem": 4.493028352, + "loss": 0.7222, + "grad_norm": 2.4172372817993164, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 1.696497664, + "gpu_mem": 4.493028352, + "loss": 0.6957, + "grad_norm": 1.0442181825637817, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 1.696497664, + "gpu_mem": 4.49302528, + "loss": 0.664, + "grad_norm": 0.735107421875, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 1.696497664, + "gpu_mem": 4.49302528, + "loss": 0.7946, + "grad_norm": 4.9637298583984375, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 1.696497664, + "gpu_mem": 4.49302528, + "loss": 0.7411, + "grad_norm": 3.2427453994750977, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 1.696497664, + "gpu_mem": 4.493031424, + "loss": 0.6978, + "grad_norm": 1.0246829986572266, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 1.696497664, + "gpu_mem": 4.493026816, + "loss": 0.7466, + "grad_norm": 3.6515204906463623, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 1.696497664, + "gpu_mem": 4.49302528, + "loss": 0.6783, + "grad_norm": 0.8173540234565735, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 1.696497664, + "gpu_mem": 4.493029888, + "loss": 0.6897, + "grad_norm": 1.459547519683838, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.6912, + "grad_norm": 1.691298246383667, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7714, + "grad_norm": 3.3266375064849854, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7235, + "grad_norm": 1.9434911012649536, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7615, + "grad_norm": 3.4989898204803467, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6885, + "grad_norm": 0.9040302634239197, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7078, + "grad_norm": 0.6956483721733093, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.7003, + "grad_norm": 0.4748885929584503, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7282, + "grad_norm": 1.2245064973831177, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7376, + "grad_norm": 1.353363037109375, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7056, + "grad_norm": 1.342427134513855, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.7451, + "grad_norm": 2.893486499786377, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7186, + "grad_norm": 1.2226550579071045, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.712, + "grad_norm": 1.6836299896240234, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7181, + "grad_norm": 1.800547480583191, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7519, + "grad_norm": 2.83194637298584, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493019136, + "loss": 0.6884, + "grad_norm": 0.2825492322444916, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6924, + "grad_norm": 0.3489837944507599, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6939, + "grad_norm": 0.1639823317527771, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.7116, + "grad_norm": 1.1705121994018555, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.696, + "grad_norm": 0.18490128219127655, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.7093, + "grad_norm": 0.9328839778900146, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7004, + "grad_norm": 0.23575809597969055, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6936, + "grad_norm": 0.220880389213562, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6941, + "grad_norm": 0.1911846101284027, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6831, + "grad_norm": 0.5680527091026306, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.7409, + "grad_norm": 2.146034002304077, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.684, + "grad_norm": 0.2460382729768753, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7147, + "grad_norm": 1.222211241722107, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7141, + "grad_norm": 1.6179394721984863, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7011, + "grad_norm": 0.5519856214523315, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.688, + "grad_norm": 0.33305254578590393, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.7177, + "grad_norm": 1.315094232559204, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.7019, + "grad_norm": 0.5177063345909119, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6935, + "grad_norm": 0.23018385469913483, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7242, + "grad_norm": 1.4038382768630981, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6983, + "grad_norm": 0.47567206621170044, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7009, + "grad_norm": 1.1548891067504883, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7364, + "grad_norm": 1.830674171447754, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.711, + "grad_norm": 1.0190329551696777, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7044, + "grad_norm": 0.8249661922454834, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6979, + "grad_norm": 0.18655584752559662, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6999, + "grad_norm": 0.45632898807525635, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.7082, + "grad_norm": 0.6185548305511475, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6939, + "grad_norm": 0.25643277168273926, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6958, + "grad_norm": 0.7324631810188293, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7179, + "grad_norm": 1.0736939907073975, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7212, + "grad_norm": 0.9404941201210022, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6935, + "grad_norm": 0.24125660955905914, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7037, + "grad_norm": 0.6701832413673401, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.695, + "grad_norm": 0.14632035791873932, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7031, + "grad_norm": 0.5828673243522644, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7003, + "grad_norm": 0.5106858015060425, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7106, + "grad_norm": 1.1175287961959839, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.705, + "grad_norm": 0.6045042276382446, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7004, + "grad_norm": 0.27942338585853577, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6957, + "grad_norm": 0.12745022773742676, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6892, + "grad_norm": 0.35973602533340454, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7019, + "grad_norm": 0.6132001876831055, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6967, + "grad_norm": 0.1171581819653511, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6947, + "grad_norm": 0.10143351554870605, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.692, + "grad_norm": 0.6896800398826599, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6991, + "grad_norm": 0.3269404172897339, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6997, + "grad_norm": 0.32721012830734253, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.7169, + "grad_norm": 0.7541475296020508, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6869, + "grad_norm": 0.14480820298194885, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6964, + "grad_norm": 0.272762656211853, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6942, + "grad_norm": 0.20602084696292877, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7073, + "grad_norm": 0.5213059186935425, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7006, + "grad_norm": 0.36949029564857483, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6906, + "grad_norm": 0.09492352604866028, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6998, + "grad_norm": 0.16219347715377808, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.691, + "grad_norm": 0.3650710880756378, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7043, + "grad_norm": 0.37806203961372375, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7005, + "grad_norm": 0.36658886075019836, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493020672, + "loss": 0.6776, + "grad_norm": 0.1979617178440094, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6936, + "grad_norm": 0.16880585253238678, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7017, + "grad_norm": 0.34666332602500916, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7214, + "grad_norm": 1.2124948501586914, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7001, + "grad_norm": 0.4066324532032013, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6997, + "grad_norm": 0.38744887709617615, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7492, + "grad_norm": 1.3124505281448364, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6981, + "grad_norm": 0.3076198399066925, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6928, + "grad_norm": 0.17415845394134521, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7025, + "grad_norm": 0.7632031440734863, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6951, + "grad_norm": 0.13217808306217194, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6975, + "grad_norm": 0.19855965673923492, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493020672, + "loss": 0.6953, + "grad_norm": 0.20367978513240814, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.703, + "grad_norm": 0.4335212707519531, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6925, + "grad_norm": 0.19620643556118011, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6983, + "grad_norm": 0.3586576282978058, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6989, + "grad_norm": 0.12707452476024628, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6985, + "grad_norm": 0.18235577642917633, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6946, + "grad_norm": 0.24550971388816833, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6996, + "grad_norm": 0.5102546811103821, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6939, + "grad_norm": 0.9080157279968262, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7036, + "grad_norm": 0.5241925716400146, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6723, + "grad_norm": 0.10313402116298676, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6791, + "grad_norm": 0.35359328985214233, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6987, + "grad_norm": 0.655633270740509, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7105, + "grad_norm": 0.7292742133140564, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6995, + "grad_norm": 0.38950204849243164, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7, + "grad_norm": 0.5125278830528259, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.692, + "grad_norm": 0.643389105796814, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6909, + "grad_norm": 0.672387421131134, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7342, + "grad_norm": 1.4714711904525757, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6947, + "grad_norm": 0.14217062294483185, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.7238, + "grad_norm": 1.05335533618927, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.711, + "grad_norm": 0.8686351776123047, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7028, + "grad_norm": 0.5422638654708862, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6918, + "grad_norm": 0.2078210562467575, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6805, + "grad_norm": 0.13184936344623566, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6853, + "grad_norm": 0.12715697288513184, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7278, + "grad_norm": 1.1451689004898071, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6986, + "grad_norm": 0.3828032612800598, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.693, + "grad_norm": 0.21105526387691498, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6928, + "grad_norm": 0.06871931254863739, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6876, + "grad_norm": 0.290257066488266, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.7107, + "grad_norm": 0.6576236486434937, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6441, + "grad_norm": 1.1662838459014893, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493039104, + "loss": 0.7143, + "grad_norm": 0.7577666640281677, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6899, + "grad_norm": 0.2897883355617523, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7092, + "grad_norm": 0.6580364108085632, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7004, + "grad_norm": 0.3102606534957886, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.692, + "grad_norm": 0.08698059618473053, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6739, + "grad_norm": 0.1946014165878296, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6827, + "grad_norm": 0.5152174830436707, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6898, + "grad_norm": 0.6915310025215149, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7178, + "grad_norm": 1.1917349100112915, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7043, + "grad_norm": 0.5981776118278503, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6935, + "grad_norm": 0.10579261928796768, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6927, + "grad_norm": 0.2089833915233612, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.6862, + "grad_norm": 0.2181556522846222, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7146, + "grad_norm": 1.1503543853759766, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7006, + "grad_norm": 0.40699440240859985, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493020672, + "loss": 0.7049, + "grad_norm": 0.7430932521820068, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6963, + "grad_norm": 0.22398938238620758, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6998, + "grad_norm": 0.8039484620094299, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6918, + "grad_norm": 0.1995018571615219, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6975, + "grad_norm": 0.4768397808074951, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6924, + "grad_norm": 0.5351897478103638, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6935, + "grad_norm": 0.4734565317630768, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6924, + "grad_norm": 0.2129371166229248, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493043712, + "loss": 0.6674, + "grad_norm": 0.9295939207077026, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7811, + "grad_norm": 3.22161865234375, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7849, + "grad_norm": 3.143036127090454, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7339, + "grad_norm": 2.1766912937164307, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6893, + "grad_norm": 0.3886832892894745, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.7288, + "grad_norm": 1.6236306428909302, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6823, + "grad_norm": 0.514438271522522, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7179, + "grad_norm": 1.2123099565505981, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7321, + "grad_norm": 1.4335020780563354, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6847, + "grad_norm": 0.21651946008205414, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6886, + "grad_norm": 0.4559673070907593, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6903, + "grad_norm": 0.9113612174987793, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6936, + "grad_norm": 0.22141362726688385, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.6916, + "grad_norm": 0.39688560366630554, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7672, + "grad_norm": 2.2791860103607178, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7152, + "grad_norm": 1.069309115409851, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.7034, + "grad_norm": 0.4910648465156555, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6945, + "grad_norm": 0.1611475646495819, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6818, + "grad_norm": 0.16853831708431244, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7282, + "grad_norm": 1.2285224199295044, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.7267, + "grad_norm": 1.1224629878997803, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6777, + "grad_norm": 0.19731394946575165, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7407, + "grad_norm": 1.686057448387146, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6972, + "grad_norm": 0.3397570848464966, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6901, + "grad_norm": 0.07230561971664429, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7043, + "grad_norm": 0.5197455286979675, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6974, + "grad_norm": 0.33101004362106323, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6774, + "grad_norm": 0.13139708340168, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7307, + "grad_norm": 1.2186952829360962, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7143, + "grad_norm": 0.8806723356246948, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6956, + "grad_norm": 0.8888772130012512, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6958, + "grad_norm": 0.25488850474357605, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6914, + "grad_norm": 0.1009368747472763, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.6993, + "grad_norm": 0.6389911770820618, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6863, + "grad_norm": 0.3630368113517761, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6856, + "grad_norm": 0.644487202167511, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7065, + "grad_norm": 0.8069692254066467, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7136, + "grad_norm": 1.434441089630127, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.694, + "grad_norm": 0.14592304825782776, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6941, + "grad_norm": 0.07811355590820312, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7032, + "grad_norm": 1.1504899263381958, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6925, + "grad_norm": 0.058415528386831284, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.6951, + "grad_norm": 0.6030896902084351, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6929, + "grad_norm": 0.08266313374042511, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7037, + "grad_norm": 0.7748782634735107, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7011, + "grad_norm": 0.615720272064209, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6963, + "grad_norm": 0.16643446683883667, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6934, + "grad_norm": 0.11306018382310867, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6945, + "grad_norm": 0.23699624836444855, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6955, + "grad_norm": 0.3691926896572113, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6884, + "grad_norm": 0.4948045015335083, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6906, + "grad_norm": 0.10192902386188507, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6969, + "grad_norm": 0.23165380954742432, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493039104, + "loss": 0.6928, + "grad_norm": 0.1594015210866928, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7003, + "grad_norm": 0.5398861169815063, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6907, + "grad_norm": 0.644536554813385, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.7039, + "grad_norm": 0.6447754502296448, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.6954, + "grad_norm": 1.0588040351867676, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7057, + "grad_norm": 0.6445721387863159, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7023, + "grad_norm": 0.45956942439079285, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6716, + "grad_norm": 0.32909414172172546, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6804, + "grad_norm": 0.06117682904005051, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.7469, + "grad_norm": 1.4097740650177002, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.717, + "grad_norm": 0.757781445980072, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7042, + "grad_norm": 0.5569053888320923, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6931, + "grad_norm": 0.1567043960094452, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6932, + "grad_norm": 0.06544040143489838, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7087, + "grad_norm": 0.46787333488464355, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6869, + "grad_norm": 0.09982309490442276, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7147, + "grad_norm": 0.4856555163860321, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6899, + "grad_norm": 0.0802168920636177, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6898, + "grad_norm": 0.26566416025161743, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.698, + "grad_norm": 0.08237555623054504, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6994, + "grad_norm": 0.09683462977409363, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.6929, + "grad_norm": 0.2064366489648819, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6996, + "grad_norm": 0.26096436381340027, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.694, + "grad_norm": 0.06342318654060364, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6951, + "grad_norm": 0.3993206024169922, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7006, + "grad_norm": 0.7187936902046204, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6955, + "grad_norm": 0.31932148337364197, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7069, + "grad_norm": 0.8649726510047913, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6925, + "grad_norm": 0.0878240168094635, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493039104, + "loss": 0.6934, + "grad_norm": 0.06309176981449127, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6922, + "grad_norm": 0.06185629963874817, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6999, + "grad_norm": 0.43249449133872986, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.689, + "grad_norm": 0.2355310171842575, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6888, + "grad_norm": 0.22602681815624237, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7038, + "grad_norm": 0.8538910150527954, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6903, + "grad_norm": 0.3964362144470215, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6961, + "grad_norm": 0.5111532211303711, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6916, + "grad_norm": 0.10857870429754257, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7025, + "grad_norm": 0.7991607785224915, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6921, + "grad_norm": 0.22464728355407715, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6938, + "grad_norm": 0.6466021537780762, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6942, + "grad_norm": 0.5049682855606079, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6983, + "grad_norm": 0.45334187150001526, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.6923, + "grad_norm": 0.5406206250190735, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6927, + "grad_norm": 0.2610889673233032, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7048, + "grad_norm": 0.7185795903205872, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.698, + "grad_norm": 0.18311288952827454, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.6883, + "grad_norm": 0.349353551864624, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7077, + "grad_norm": 0.523723304271698, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7395, + "grad_norm": 1.2505675554275513, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7039, + "grad_norm": 0.4622817039489746, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6922, + "grad_norm": 0.1096465215086937, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7107, + "grad_norm": 1.0897701978683472, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.6997, + "grad_norm": 0.37231534719467163, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.696, + "grad_norm": 0.18987052142620087, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6962, + "grad_norm": 0.35805419087409973, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6899, + "grad_norm": 0.1291409432888031, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6957, + "grad_norm": 0.21812103688716888, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493019136, + "loss": 0.7001, + "grad_norm": 0.29239845275878906, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7279, + "grad_norm": 0.8060789704322815, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493020672, + "loss": 0.726, + "grad_norm": 0.8248571753501892, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6949, + "grad_norm": 0.07870083302259445, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7014, + "grad_norm": 0.3497270345687866, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6964, + "grad_norm": 0.12097702920436859, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6997, + "grad_norm": 0.18935005366802216, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6914, + "grad_norm": 0.0663309395313263, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.712, + "grad_norm": 0.531062126159668, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6953, + "grad_norm": 0.10451968759298325, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6945, + "grad_norm": 0.18120066821575165, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493039104, + "loss": 0.6936, + "grad_norm": 0.09242997318506241, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6877, + "grad_norm": 0.1183919832110405, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6853, + "grad_norm": 0.11475009471178055, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6902, + "grad_norm": 0.15166357159614563, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7264, + "grad_norm": 0.7986534833908081, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.693, + "grad_norm": 0.07340432703495026, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.708, + "grad_norm": 0.647648274898529, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6953, + "grad_norm": 0.381138414144516, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.701, + "grad_norm": 0.3000425100326538, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.6925, + "grad_norm": 0.05272788926959038, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7013, + "grad_norm": 0.3267762064933777, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6936, + "grad_norm": 0.13041891157627106, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6977, + "grad_norm": 0.2791895866394043, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6916, + "grad_norm": 0.29551464319229126, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6937, + "grad_norm": 0.25261348485946655, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6933, + "grad_norm": 0.1818682700395584, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6886, + "grad_norm": 0.2628140151500702, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6865, + "grad_norm": 0.317868173122406, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6677, + "grad_norm": 0.5175949931144714, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6933, + "grad_norm": 0.16937540471553802, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7038, + "grad_norm": 0.29901760816574097, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493039104, + "loss": 0.7339, + "grad_norm": 0.5926758646965027, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7128, + "grad_norm": 0.37578606605529785, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.7099, + "grad_norm": 0.3230392336845398, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6875, + "grad_norm": 0.046328164637088776, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7011, + "grad_norm": 0.23796454071998596, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7032, + "grad_norm": 0.3285863995552063, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6938, + "grad_norm": 0.05735310539603233, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6963, + "grad_norm": 0.11120530217885971, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6847, + "grad_norm": 0.2426394671201706, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6845, + "grad_norm": 0.12603989243507385, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6988, + "grad_norm": 0.21568737924098969, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7025, + "grad_norm": 0.27265453338623047, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6765, + "grad_norm": 0.06111739203333855, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.7288, + "grad_norm": 0.5730329751968384, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7287, + "grad_norm": 0.5682311654090881, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7041, + "grad_norm": 0.3026084005832672, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6986, + "grad_norm": 0.18084314465522766, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6902, + "grad_norm": 0.1516728699207306, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.69, + "grad_norm": 0.2174105942249298, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6932, + "grad_norm": 0.07139900326728821, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6979, + "grad_norm": 0.3801301419734955, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.697, + "grad_norm": 0.32542312145233154, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6966, + "grad_norm": 0.07485301047563553, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6953, + "grad_norm": 0.28153711557388306, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6969, + "grad_norm": 0.2658757269382477, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.4930176, + "loss": 0.6936, + "grad_norm": 0.1331307739019394, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6912, + "grad_norm": 0.0664614588022232, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6938, + "grad_norm": 0.3038892447948456, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6931, + "grad_norm": 0.08123783022165298, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6961, + "grad_norm": 0.18597663938999176, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6882, + "grad_norm": 1.3250588178634644, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6982, + "grad_norm": 0.38798844814300537, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.7026, + "grad_norm": 0.39895808696746826, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7067, + "grad_norm": 0.49593594670295715, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6985, + "grad_norm": 0.38116171956062317, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6979, + "grad_norm": 0.15380047261714935, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6948, + "grad_norm": 0.31031155586242676, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6942, + "grad_norm": 0.08970730006694794, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6884, + "grad_norm": 0.467436820268631, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.69, + "grad_norm": 0.06760562211275101, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7105, + "grad_norm": 0.552223265171051, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.719, + "grad_norm": 0.7308666110038757, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7002, + "grad_norm": 0.309447705745697, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6811, + "grad_norm": 0.29352909326553345, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6852, + "grad_norm": 0.24660950899124146, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6859, + "grad_norm": 0.2273276299238205, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6944, + "grad_norm": 0.12357987463474274, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6952, + "grad_norm": 0.20386433601379395, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6976, + "grad_norm": 0.24770157039165497, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6936, + "grad_norm": 0.09896448999643326, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49304064, + "loss": 0.6866, + "grad_norm": 0.322651207447052, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6959, + "grad_norm": 0.2505664825439453, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6951, + "grad_norm": 0.2869178354740143, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49304064, + "loss": 0.6939, + "grad_norm": 0.20747576653957367, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6909, + "grad_norm": 0.2358819544315338, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7016, + "grad_norm": 0.33987775444984436, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6856, + "grad_norm": 0.3135680556297302, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7004, + "grad_norm": 0.283866286277771, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6829, + "grad_norm": 0.1594628244638443, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6877, + "grad_norm": 0.07586205005645752, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.7107, + "grad_norm": 0.5134323239326477, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.7214, + "grad_norm": 0.7753734588623047, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.7028, + "grad_norm": 0.3212447166442871, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6871, + "grad_norm": 0.13168202340602875, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6993, + "grad_norm": 0.2262464463710785, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6918, + "grad_norm": 0.04692644625902176, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.695, + "grad_norm": 0.22334837913513184, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6909, + "grad_norm": 0.05742593854665756, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.693, + "grad_norm": 0.07806466519832611, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6924, + "grad_norm": 0.08753010630607605, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6969, + "grad_norm": 0.17764650285243988, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6716, + "grad_norm": 0.43902039527893066, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6938, + "grad_norm": 0.076702781021595, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.6927, + "grad_norm": 0.10331606864929199, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.6973, + "grad_norm": 0.14175844192504883, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7142, + "grad_norm": 0.4877012372016907, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.7115, + "grad_norm": 0.4726301431655884, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6776, + "grad_norm": 0.28996241092681885, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.695, + "grad_norm": 0.13470716774463654, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6924, + "grad_norm": 0.05895346775650978, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6988, + "grad_norm": 0.38138335943222046, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6891, + "grad_norm": 0.19125130772590637, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6903, + "grad_norm": 0.23779939115047455, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6919, + "grad_norm": 0.1884462535381317, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.6938, + "grad_norm": 0.21007876098155975, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.693, + "grad_norm": 0.18506306409835815, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6937, + "grad_norm": 0.42826855182647705, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6985, + "grad_norm": 0.4365687966346741, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6899, + "grad_norm": 0.1633734405040741, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.6878, + "grad_norm": 0.13228663802146912, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6954, + "grad_norm": 0.14739520847797394, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6949, + "grad_norm": 0.24974851310253143, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.7029, + "grad_norm": 0.46807584166526794, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.7021, + "grad_norm": 0.4513898193836212, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6896, + "grad_norm": 0.19563136994838715, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6952, + "grad_norm": 0.5663607716560364, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6921, + "grad_norm": 0.09538810700178146, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6985, + "grad_norm": 0.49463847279548645, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7046, + "grad_norm": 0.7449460625648499, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6815, + "grad_norm": 0.6319383382797241, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6965, + "grad_norm": 0.29675355553627014, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6982, + "grad_norm": 0.317842036485672, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6883, + "grad_norm": 0.3356810212135315, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493037568, + "loss": 0.695, + "grad_norm": 0.08812369406223297, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493022208, + "loss": 0.7047, + "grad_norm": 0.7297579050064087, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6928, + "grad_norm": 0.10384070128202438, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6916, + "grad_norm": 0.14777417480945587, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6905, + "grad_norm": 0.5010061264038086, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6941, + "grad_norm": 0.05859508737921715, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6926, + "grad_norm": 0.10840573161840439, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6948, + "grad_norm": 0.10780446976423264, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6931, + "grad_norm": 0.24686187505722046, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49303296, + "loss": 0.6941, + "grad_norm": 0.10314740240573883, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493036032, + "loss": 0.6973, + "grad_norm": 0.11145162582397461, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6964, + "grad_norm": 0.056053824722766876, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493028352, + "loss": 0.6926, + "grad_norm": 0.06714598834514618, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6918, + "grad_norm": 0.7789713144302368, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.6925, + "grad_norm": 0.06171654537320137, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6907, + "grad_norm": 0.15443353354930878, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493029888, + "loss": 0.6975, + "grad_norm": 0.27569693326950073, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.687, + "grad_norm": 0.17305229604244232, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493034496, + "loss": 0.6884, + "grad_norm": 0.07901181280612946, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6818, + "grad_norm": 0.215073361992836, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493026816, + "loss": 0.6855, + "grad_norm": 0.07525154203176498, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493023744, + "loss": 0.6858, + "grad_norm": 0.05783047527074814, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 1.696694272, + "gpu_mem": 4.493031424, + "loss": 0.6864, + "grad_norm": 0.10311410576105118, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 1.696694272, + "gpu_mem": 4.49302528, + "loss": 0.7158, + "grad_norm": 0.782332718372345, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6988, + "grad_norm": 0.3951030373573303, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6635, + "grad_norm": 0.5224807262420654, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6724, + "grad_norm": 0.32411062717437744, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6973, + "grad_norm": 0.23633863031864166, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.7122, + "grad_norm": 0.734066903591156, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.7051, + "grad_norm": 0.5485091209411621, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.7001, + "grad_norm": 0.3565041720867157, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.7045, + "grad_norm": 0.6044797301292419, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6955, + "grad_norm": 0.09000197798013687, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6921, + "grad_norm": 0.2152387499809265, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.7078, + "grad_norm": 0.7516205906867981, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6816, + "grad_norm": 0.3646191358566284, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6963, + "grad_norm": 0.08513715118169785, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6995, + "grad_norm": 0.25939130783081055, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6824, + "grad_norm": 0.20606845617294312, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6996, + "grad_norm": 0.1619577258825302, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493037568, + "loss": 0.6898, + "grad_norm": 0.10774437338113785, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6972, + "grad_norm": 0.16319096088409424, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.7081, + "grad_norm": 0.41613149642944336, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6983, + "grad_norm": 0.23476460576057434, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6823, + "grad_norm": 0.26195529103279114, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6982, + "grad_norm": 0.2508927583694458, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6922, + "grad_norm": 0.0989418476819992, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493022208, + "loss": 0.6977, + "grad_norm": 0.1494150310754776, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6955, + "grad_norm": 0.06688978523015976, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6946, + "grad_norm": 0.05621028319001198, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6935, + "grad_norm": 0.15755125880241394, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6929, + "grad_norm": 0.17250403761863708, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6931, + "grad_norm": 0.18549101054668427, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6925, + "grad_norm": 0.19919635355472565, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493022208, + "loss": 0.6907, + "grad_norm": 0.1915500909090042, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.692, + "grad_norm": 0.16469426453113556, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6976, + "grad_norm": 0.23552103340625763, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6945, + "grad_norm": 0.06904135644435883, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6955, + "grad_norm": 0.22200410068035126, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6911, + "grad_norm": 0.21895430982112885, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6948, + "grad_norm": 0.12836559116840363, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6923, + "grad_norm": 0.31419360637664795, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6955, + "grad_norm": 0.33867669105529785, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6927, + "grad_norm": 0.15540319681167603, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6932, + "grad_norm": 0.2889021933078766, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6921, + "grad_norm": 0.1372951865196228, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6939, + "grad_norm": 0.3819200396537781, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6897, + "grad_norm": 0.11837071925401688, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6917, + "grad_norm": 0.401557981967926, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493037568, + "loss": 0.6922, + "grad_norm": 0.13242988288402557, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493037568, + "loss": 0.6966, + "grad_norm": 0.29421326518058777, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6988, + "grad_norm": 0.38690340518951416, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493036032, + "loss": 0.696, + "grad_norm": 0.11478157341480255, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6966, + "grad_norm": 0.16195248067378998, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6938, + "grad_norm": 0.04410400241613388, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6899, + "grad_norm": 0.1583573818206787, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6895, + "grad_norm": 0.20718729496002197, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6942, + "grad_norm": 0.04870573803782463, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6976, + "grad_norm": 0.29025962948799133, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.693, + "grad_norm": 0.06557489931583405, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493036032, + "loss": 0.6916, + "grad_norm": 0.06221061944961548, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493022208, + "loss": 0.6878, + "grad_norm": 0.21418863534927368, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6969, + "grad_norm": 0.24166400730609894, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6966, + "grad_norm": 0.11592723429203033, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6924, + "grad_norm": 0.058626722544431686, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6933, + "grad_norm": 0.06387929618358612, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6912, + "grad_norm": 0.1360437273979187, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6912, + "grad_norm": 0.22925372421741486, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6943, + "grad_norm": 0.22309960424900055, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6942, + "grad_norm": 0.21509838104248047, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6904, + "grad_norm": 0.44420284032821655, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6894, + "grad_norm": 0.5426042675971985, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6938, + "grad_norm": 0.052213966846466064, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6899, + "grad_norm": 0.22281043231487274, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6924, + "grad_norm": 0.06270848959684372, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6888, + "grad_norm": 0.16507966816425323, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6925, + "grad_norm": 0.06773971021175385, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6896, + "grad_norm": 0.07986762374639511, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6999, + "grad_norm": 0.3209311068058014, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6917, + "grad_norm": 0.08552578836679459, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6927, + "grad_norm": 0.0907055214047432, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6881, + "grad_norm": 0.10205096006393433, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6975, + "grad_norm": 0.30923786759376526, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6854, + "grad_norm": 0.2259840965270996, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493042176, + "loss": 0.7017, + "grad_norm": 0.3666524887084961, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.7022, + "grad_norm": 0.36436206102371216, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6939, + "grad_norm": 0.15668059885501862, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6944, + "grad_norm": 0.11514193564653397, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6966, + "grad_norm": 0.10471631586551666, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6935, + "grad_norm": 0.13799601793289185, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6903, + "grad_norm": 0.14637072384357452, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6925, + "grad_norm": 0.16684648394584656, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6905, + "grad_norm": 0.07885071635246277, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6922, + "grad_norm": 0.3496592938899994, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6918, + "grad_norm": 0.07611975073814392, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6934, + "grad_norm": 0.15355516970157623, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6921, + "grad_norm": 0.14245451986789703, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6879, + "grad_norm": 0.07326628267765045, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493036032, + "loss": 0.6963, + "grad_norm": 0.0877152681350708, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6908, + "grad_norm": 0.05573296174407005, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6902, + "grad_norm": 0.18238958716392517, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6923, + "grad_norm": 0.08995348960161209, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6931, + "grad_norm": 0.08982895314693451, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493039104, + "loss": 0.6898, + "grad_norm": 0.12476994097232819, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.696, + "grad_norm": 0.2012532651424408, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6929, + "grad_norm": 0.0789322629570961, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6945, + "grad_norm": 0.1620406061410904, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6873, + "grad_norm": 0.27678102254867554, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493036032, + "loss": 0.6854, + "grad_norm": 0.574009358882904, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6905, + "grad_norm": 0.07018734514713287, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.697, + "grad_norm": 0.280269056558609, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6985, + "grad_norm": 0.2342129498720169, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6925, + "grad_norm": 0.15905432403087616, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6929, + "grad_norm": 0.1872699111700058, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6926, + "grad_norm": 0.06447822600603104, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.7033, + "grad_norm": 0.40418362617492676, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6959, + "grad_norm": 0.13371387124061584, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6903, + "grad_norm": 0.10461732745170593, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6914, + "grad_norm": 0.06751887500286102, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.7025, + "grad_norm": 0.44440704584121704, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6997, + "grad_norm": 0.2823053002357483, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6918, + "grad_norm": 0.17047429084777832, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6986, + "grad_norm": 0.3142000138759613, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6883, + "grad_norm": 0.09026341885328293, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6943, + "grad_norm": 0.10978658497333527, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6865, + "grad_norm": 0.2786584198474884, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6927, + "grad_norm": 0.1552763283252716, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6901, + "grad_norm": 0.18652953207492828, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6911, + "grad_norm": 0.05882210284471512, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6974, + "grad_norm": 0.20268455147743225, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6933, + "grad_norm": 0.07182348519563675, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493037568, + "loss": 0.6933, + "grad_norm": 0.09048701822757721, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6937, + "grad_norm": 0.05703119933605194, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6865, + "grad_norm": 0.32615232467651367, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6893, + "grad_norm": 0.13133034110069275, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493022208, + "loss": 0.6902, + "grad_norm": 0.06653029471635818, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.691, + "grad_norm": 0.07508520036935806, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6887, + "grad_norm": 0.07525808364152908, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493037568, + "loss": 0.6894, + "grad_norm": 0.13507962226867676, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6977, + "grad_norm": 0.3384900689125061, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493037568, + "loss": 0.6884, + "grad_norm": 0.10127384215593338, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6864, + "grad_norm": 0.27799150347709656, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6941, + "grad_norm": 0.20866553485393524, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6976, + "grad_norm": 0.2923052906990051, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6947, + "grad_norm": 0.44352656602859497, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493036032, + "loss": 0.6963, + "grad_norm": 0.24066875874996185, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6923, + "grad_norm": 0.18978099524974823, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6975, + "grad_norm": 0.44291582703590393, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493036032, + "loss": 0.6927, + "grad_norm": 0.06275568157434464, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6902, + "grad_norm": 0.23561690747737885, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6924, + "grad_norm": 0.10850068181753159, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6932, + "grad_norm": 0.2521173059940338, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6927, + "grad_norm": 0.1406988650560379, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6929, + "grad_norm": 0.14805816113948822, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6905, + "grad_norm": 0.09466034173965454, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6922, + "grad_norm": 0.13128240406513214, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493034496, + "loss": 0.6911, + "grad_norm": 0.2240767627954483, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6934, + "grad_norm": 0.10662133991718292, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493023744, + "loss": 0.6946, + "grad_norm": 0.36319100856781006, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6905, + "grad_norm": 0.17981187999248505, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6901, + "grad_norm": 0.15079951286315918, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6916, + "grad_norm": 0.20673954486846924, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6906, + "grad_norm": 0.31353992223739624, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6973, + "grad_norm": 0.09800034016370773, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6915, + "grad_norm": 0.19551484286785126, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493022208, + "loss": 0.6928, + "grad_norm": 0.1656205803155899, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6944, + "grad_norm": 0.08614305406808853, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6903, + "grad_norm": 0.1983010321855545, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6932, + "grad_norm": 0.21022222936153412, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6906, + "grad_norm": 0.06873234361410141, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.691, + "grad_norm": 0.17068582773208618, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.689, + "grad_norm": 0.06660193204879761, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6916, + "grad_norm": 0.07141067832708359, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6909, + "grad_norm": 0.24069342017173767, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6943, + "grad_norm": 0.3169083595275879, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6974, + "grad_norm": 0.12587815523147583, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6886, + "grad_norm": 0.26202312111854553, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6933, + "grad_norm": 0.13029184937477112, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6905, + "grad_norm": 0.10749762505292892, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6916, + "grad_norm": 0.0625760480761528, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6924, + "grad_norm": 0.1227780357003212, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6897, + "grad_norm": 0.12869249284267426, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493026816, + "loss": 0.6935, + "grad_norm": 0.2155870795249939, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49302528, + "loss": 0.6909, + "grad_norm": 0.2240155041217804, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493039104, + "loss": 0.6898, + "grad_norm": 0.1806894689798355, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6898, + "grad_norm": 0.10770968347787857, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6904, + "grad_norm": 0.06192193925380707, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493029888, + "loss": 0.6915, + "grad_norm": 0.2151312679052353, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6921, + "grad_norm": 0.07607932388782501, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493022208, + "loss": 0.6907, + "grad_norm": 0.0835399255156517, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 1.69689088, + "gpu_mem": 4.49303296, + "loss": 0.6944, + "grad_norm": 0.360629677772522, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6912, + "grad_norm": 0.2038470208644867, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493031424, + "loss": 0.6923, + "grad_norm": 0.2071213275194168, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "loss": 0.6927, + "grad_norm": 0.17101933062076569, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 1.69689088, + "gpu_mem": 4.493028352, + "train_runtime": 1426.2981, + "train_samples_per_second": 28.701, + "train_steps_per_second": 0.449, + "total_flos": 1.4646189048397824e+16, + "train_loss": 0.728723248746246 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f43ee5d95e6efa86bc12e96d56fbf5a2c265b7 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6a1c907f23adc12e7e3382421af2e67230df2a28 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.3174061433447099 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..4136b72b9899c6f97051da4498cce71323829af5 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-arc_c-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-arc_c-r2-a2", + "seed": 42, + "timestamp": "2025-08-31T23:12:54.963595" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..8cb4588180204d04b6ae14a055c03101cab6d9f3 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r2-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 3.7005312, + "gpu_mem": 1.056004608, + "loss": 4.4743, + "grad_norm": 17.3563232421875, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 3.700727808, + "gpu_mem": 1.06861056, + "loss": 4.5283, + "grad_norm": 18.069007873535156, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 3.700727808, + "gpu_mem": 1.06864128, + "loss": 4.1227, + "grad_norm": 18.82330322265625, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 3.700727808, + "gpu_mem": 1.068607488, + "loss": 3.6267, + "grad_norm": 17.52536964416504, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 3.700727808, + "gpu_mem": 1.0685952, + "loss": 3.0515, + "grad_norm": 16.327932357788086, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068658176, + "loss": 2.4667, + "grad_norm": 10.08739948272705, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 3.700924416, + "gpu_mem": 1.06866432, + "loss": 1.8524, + "grad_norm": 5.373015403747559, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068622848, + "loss": 1.6163, + "grad_norm": 3.53236985206604, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 3.700924416, + "gpu_mem": 1.06861824, + "loss": 1.655, + "grad_norm": 4.065086364746094, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068607488, + "loss": 1.3922, + "grad_norm": 1.2350471019744873, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 3.700924416, + "gpu_mem": 1.06861824, + "loss": 1.3809, + "grad_norm": 1.329240083694458, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068642816, + "loss": 1.4498, + "grad_norm": 3.537505865097046, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068642816, + "loss": 1.4374, + "grad_norm": 4.314914226531982, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068590592, + "loss": 1.5023, + "grad_norm": 3.6951515674591064, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068665856, + "loss": 1.5626, + "grad_norm": 4.894389629364014, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068659712, + "loss": 1.3969, + "grad_norm": 1.7418400049209595, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.06866432, + "loss": 1.4596, + "grad_norm": 2.8951644897460938, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074929152, + "loss": 2.0044, + "grad_norm": 2.531168222427368, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074927616, + "loss": 1.3898, + "grad_norm": 1.39984929561615, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 3.700924416, + "gpu_mem": 1.07490304, + "loss": 1.4157, + "grad_norm": 2.3767001628875732, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 3.700924416, + "gpu_mem": 1.07491072, + "loss": 1.3668, + "grad_norm": 1.178501844406128, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074939904, + "loss": 1.3261, + "grad_norm": 1.0675058364868164, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074969088, + "loss": 1.3068, + "grad_norm": 0.7573672533035278, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074912256, + "loss": 1.3635, + "grad_norm": 1.4012393951416016, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074981376, + "loss": 1.3021, + "grad_norm": 0.961760401725769, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074938368, + "loss": 1.3227, + "grad_norm": 1.0150567293167114, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074896896, + "loss": 1.3462, + "grad_norm": 1.226587176322937, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074942976, + "loss": 1.51, + "grad_norm": 3.3016269207000732, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074938368, + "loss": 1.357, + "grad_norm": 0.9773362278938293, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074927616, + "loss": 1.3517, + "grad_norm": 1.3701797723770142, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074958336, + "loss": 1.3518, + "grad_norm": 0.8749883770942688, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074967552, + "loss": 1.3811, + "grad_norm": 1.0001131296157837, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074947584, + "loss": 1.3948, + "grad_norm": 1.1911927461624146, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 3.700924416, + "gpu_mem": 1.07492608, + "loss": 1.3715, + "grad_norm": 1.2399791479110718, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074813952, + "loss": 2.1258, + "grad_norm": 2.229743719100952, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068636672, + "loss": 1.4029, + "grad_norm": 2.037341356277466, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068645888, + "loss": 1.3682, + "grad_norm": 0.8479436635971069, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068616704, + "loss": 1.3661, + "grad_norm": 1.0788065195083618, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068635136, + "loss": 1.3411, + "grad_norm": 0.960898756980896, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068612096, + "loss": 1.3756, + "grad_norm": 0.9445275068283081, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068613632, + "loss": 1.3833, + "grad_norm": 0.9456892013549805, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068642816, + "loss": 1.3112, + "grad_norm": 1.3252665996551514, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068658176, + "loss": 1.3501, + "grad_norm": 1.1131830215454102, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068676608, + "loss": 1.3222, + "grad_norm": 0.789993941783905, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068630528, + "loss": 1.3119, + "grad_norm": 0.7824442982673645, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068624384, + "loss": 1.2969, + "grad_norm": 0.9669123888015747, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 3.700924416, + "gpu_mem": 1.06861824, + "loss": 1.3252, + "grad_norm": 1.0396476984024048, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068622848, + "loss": 1.2596, + "grad_norm": 0.8679232001304626, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068613632, + "loss": 1.3137, + "grad_norm": 0.8202853798866272, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.0685952, + "loss": 1.3208, + "grad_norm": 1.0051933526992798, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068619776, + "loss": 1.3436, + "grad_norm": 1.4980287551879883, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 3.700924416, + "gpu_mem": 1.068647424, + "loss": 1.3677, + "grad_norm": 1.0744749307632446, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074924544, + "loss": 1.9324, + "grad_norm": 1.3372554779052734, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074893824, + "loss": 1.3469, + "grad_norm": 1.2190483808517456, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074927616, + "loss": 1.3594, + "grad_norm": 1.6697105169296265, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 3.700924416, + "gpu_mem": 1.075001344, + "loss": 1.3225, + "grad_norm": 0.9616037011146545, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074944512, + "loss": 1.3007, + "grad_norm": 0.8286488652229309, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074938368, + "loss": 1.2296, + "grad_norm": 1.3770592212677002, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074989056, + "loss": 1.2958, + "grad_norm": 1.0127066373825073, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074915328, + "loss": 1.3358, + "grad_norm": 1.5256590843200684, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074929152, + "loss": 1.3548, + "grad_norm": 1.3627742528915405, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074930688, + "loss": 1.3413, + "grad_norm": 1.352027416229248, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074919936, + "loss": 1.3324, + "grad_norm": 1.1300991773605347, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074936832, + "loss": 1.2993, + "grad_norm": 1.3003207445144653, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074958336, + "loss": 1.3521, + "grad_norm": 1.458931565284729, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 3.700924416, + "gpu_mem": 1.07494912, + "loss": 1.2798, + "grad_norm": 1.2680859565734863, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 3.700924416, + "gpu_mem": 1.074975232, + "loss": 1.2798, + "grad_norm": 0.9544034600257874, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.07492608, + "loss": 1.2954, + "grad_norm": 1.4769362211227417, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 3.700924416, + "gpu_mem": 1.07492608, + "train_runtime": 382.2546, + "train_samples_per_second": 11.709, + "train_steps_per_second": 0.178, + "total_flos": 4001546965180416.0, + "train_loss": 1.6085487446364235 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..97cff55d3f03a364161498b7b6299c246238daf5 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b4bf34913d6961a9bb1e7005b10d029272de0cdf --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.5742320819112628 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f0b09f382b39bfdc64fb818810a2160c141efd47 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-arc_c-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-arc_c-r32-a2", + "seed": 42, + "timestamp": "2025-09-01T13:36:06.437312" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..e44b3bc77a6f78adc5375736cc5821f4f81eaaa8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r32-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 3.268227072, + "gpu_mem": 1.150622208, + "loss": 4.4743, + "grad_norm": 70.48493194580078, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 3.26842368, + "gpu_mem": 1.35246336, + "loss": 4.5283, + "grad_norm": 73.35309600830078, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 3.268620288, + "gpu_mem": 1.35249408, + "loss": 2.2008, + "grad_norm": 27.49148178100586, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352460288, + "loss": 1.5604, + "grad_norm": 4.846491813659668, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352448, + "loss": 1.4445, + "grad_norm": 3.921125650405884, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352510976, + "loss": 1.4458, + "grad_norm": 5.514718532562256, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 3.268620288, + "gpu_mem": 1.35251712, + "loss": 1.4613, + "grad_norm": 4.101064682006836, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352475648, + "loss": 1.443, + "grad_norm": 2.8938405513763428, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 3.268620288, + "gpu_mem": 1.35247104, + "loss": 1.3675, + "grad_norm": 2.940446376800537, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352460288, + "loss": 1.5168, + "grad_norm": 3.3495631217956543, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 3.268620288, + "gpu_mem": 1.35247104, + "loss": 1.4508, + "grad_norm": 2.0165822505950928, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352495616, + "loss": 1.416, + "grad_norm": 1.6785409450531006, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352495616, + "loss": 1.3385, + "grad_norm": 2.197312831878662, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352443392, + "loss": 1.6075, + "grad_norm": 3.844707489013672, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352518656, + "loss": 1.5023, + "grad_norm": 3.2151381969451904, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352512512, + "loss": 1.4752, + "grad_norm": 3.953890323638916, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.35251712, + "loss": 1.4831, + "grad_norm": 3.3845419883728027, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453399552, + "loss": 2.2146, + "grad_norm": 8.789177894592285, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453398016, + "loss": 1.4243, + "grad_norm": 2.396503448486328, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 3.268620288, + "gpu_mem": 1.45337344, + "loss": 1.3645, + "grad_norm": 2.610074281692505, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 3.268620288, + "gpu_mem": 1.45338112, + "loss": 1.4127, + "grad_norm": 2.9313762187957764, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453410304, + "loss": 1.3393, + "grad_norm": 1.7093162536621094, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453439488, + "loss": 1.3418, + "grad_norm": 2.065953254699707, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453382656, + "loss": 1.3266, + "grad_norm": 1.2991076707839966, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453451776, + "loss": 1.3158, + "grad_norm": 1.3514689207077026, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453408768, + "loss": 1.3767, + "grad_norm": 1.6894904375076294, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453367296, + "loss": 1.4377, + "grad_norm": 2.078199625015259, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453413376, + "loss": 1.5796, + "grad_norm": 4.497745513916016, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453408768, + "loss": 1.3561, + "grad_norm": 1.1484696865081787, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453398016, + "loss": 1.3637, + "grad_norm": 1.4079972505569458, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453428736, + "loss": 1.3608, + "grad_norm": 1.4038970470428467, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453437952, + "loss": 1.3584, + "grad_norm": 1.1175187826156616, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453417984, + "loss": 1.4022, + "grad_norm": 1.4485479593276978, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 3.268620288, + "gpu_mem": 1.45339648, + "loss": 1.4038, + "grad_norm": 1.5805984735488892, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453284352, + "loss": 2.0434, + "grad_norm": 1.0675715208053589, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352489472, + "loss": 1.3122, + "grad_norm": 0.9641790390014648, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352498688, + "loss": 1.4003, + "grad_norm": 1.7670724391937256, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352469504, + "loss": 1.3414, + "grad_norm": 1.0632096529006958, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352487936, + "loss": 1.2838, + "grad_norm": 1.137154221534729, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352464896, + "loss": 1.3565, + "grad_norm": 1.7783344984054565, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352466432, + "loss": 1.3844, + "grad_norm": 2.21124529838562, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352495616, + "loss": 1.2961, + "grad_norm": 2.1071722507476807, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352510976, + "loss": 1.3001, + "grad_norm": 1.9287575483322144, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352529408, + "loss": 1.2904, + "grad_norm": 1.698517918586731, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352483328, + "loss": 1.2474, + "grad_norm": 1.6059659719467163, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352477184, + "loss": 1.2039, + "grad_norm": 1.3696398735046387, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 3.268620288, + "gpu_mem": 1.35247104, + "loss": 1.2029, + "grad_norm": 1.5597642660140991, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352475648, + "loss": 1.1775, + "grad_norm": 1.6748318672180176, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352466432, + "loss": 1.2206, + "grad_norm": 2.7124218940734863, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352448, + "loss": 1.2471, + "grad_norm": 2.056762456893921, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352472576, + "loss": 1.2159, + "grad_norm": 2.0742671489715576, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 3.268620288, + "gpu_mem": 1.352500224, + "loss": 1.2586, + "grad_norm": 1.9112428426742554, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453394944, + "loss": 1.7335, + "grad_norm": 3.4679014682769775, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453364224, + "loss": 1.1335, + "grad_norm": 2.1489686965942383, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453398016, + "loss": 1.1386, + "grad_norm": 2.577728033065796, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453471744, + "loss": 1.0852, + "grad_norm": 2.618350028991699, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453414912, + "loss": 1.0791, + "grad_norm": 2.144256830215454, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453408768, + "loss": 0.9717, + "grad_norm": 2.7098464965820312, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453459456, + "loss": 0.9905, + "grad_norm": 2.3563175201416016, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453385728, + "loss": 1.1574, + "grad_norm": 5.1379828453063965, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453399552, + "loss": 1.128, + "grad_norm": 3.686218738555908, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453401088, + "loss": 1.1009, + "grad_norm": 3.122516632080078, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453390336, + "loss": 1.1137, + "grad_norm": 3.1904258728027344, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453407232, + "loss": 1.0441, + "grad_norm": 4.285902976989746, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453428736, + "loss": 1.0882, + "grad_norm": 3.524383306503296, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 3.268620288, + "gpu_mem": 1.45341952, + "loss": 1.0573, + "grad_norm": 3.218918800354004, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 3.268620288, + "gpu_mem": 1.453445632, + "loss": 1.0501, + "grad_norm": 3.1913533210754395, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.45339648, + "loss": 1.0176, + "grad_norm": 2.7636172771453857, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 3.268620288, + "gpu_mem": 1.45339648, + "train_runtime": 386.4963, + "train_samples_per_second": 11.581, + "train_steps_per_second": 0.176, + "total_flos": 4092904137302016.0, + "train_loss": 1.4377437680959702 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a34e999804ff05ab393ed2117c936e4d7827f88f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d9e665d202db0151c3fa60a609bdd3a4f9b94b0d --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_c", + "results": 0.4138225255972696 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5a920a99492f90b0fb73de01d8cfe2bb436153e8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_C", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-arc_c-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-arc_c-r8-a2", + "seed": 42, + "timestamp": "2025-09-01T06:24:25.900462" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..72fbf812655da845b68bcd40cee48602b9548e48 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_c-r8-a2/training_logs.json @@ -0,0 +1,625 @@ +[ + { + "step": 1, + "epoch": 0.05714285714285714, + "cpu_mem": 3.32398592, + "gpu_mem": 1.074928128, + "loss": 4.4743, + "grad_norm": 33.909698486328125, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 2, + "epoch": 0.11428571428571428, + "cpu_mem": 3.324182528, + "gpu_mem": 1.12538112, + "loss": 4.5283, + "grad_norm": 35.14842224121094, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 3, + "epoch": 0.17142857142857143, + "cpu_mem": 3.324379136, + "gpu_mem": 1.12541184, + "loss": 3.5695, + "grad_norm": 33.57164764404297, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 4, + "epoch": 0.22857142857142856, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125378048, + "loss": 2.3501, + "grad_norm": 17.770660400390625, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 5, + "epoch": 0.2857142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.12536576, + "loss": 1.7579, + "grad_norm": 6.073936462402344, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 6, + "epoch": 0.34285714285714286, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125428736, + "loss": 1.6111, + "grad_norm": 3.049161911010742, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 7, + "epoch": 0.4, + "cpu_mem": 3.324575744, + "gpu_mem": 1.12543488, + "loss": 1.4819, + "grad_norm": 2.516075372695923, + "learning_rate": 0.0003 + }, + { + "step": 8, + "epoch": 0.45714285714285713, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125393408, + "loss": 1.4278, + "grad_norm": 2.897015333175659, + "learning_rate": 0.00029980111348272456 + }, + { + "step": 9, + "epoch": 0.5142857142857142, + "cpu_mem": 3.324575744, + "gpu_mem": 1.1253888, + "loss": 1.3528, + "grad_norm": 3.4002885818481445, + "learning_rate": 0.00029920498134218835 + }, + { + "step": 10, + "epoch": 0.5714285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125378048, + "loss": 1.5596, + "grad_norm": 8.238040924072266, + "learning_rate": 0.0002982131844136615 + }, + { + "step": 11, + "epoch": 0.6285714285714286, + "cpu_mem": 3.324575744, + "gpu_mem": 1.1253888, + "loss": 1.4422, + "grad_norm": 3.1994659900665283, + "learning_rate": 0.0002968283527643036 + }, + { + "step": 12, + "epoch": 0.6857142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125413376, + "loss": 1.4093, + "grad_norm": 2.132479190826416, + "learning_rate": 0.000295054158718698 + }, + { + "step": 13, + "epoch": 0.7428571428571429, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125413376, + "loss": 1.3542, + "grad_norm": 3.2087483406066895, + "learning_rate": 0.00029289530712050735 + }, + { + "step": 14, + "epoch": 0.8, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125361152, + "loss": 1.5932, + "grad_norm": 4.739607810974121, + "learning_rate": 0.000290357522856074 + }, + { + "step": 15, + "epoch": 0.8571428571428571, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125436416, + "loss": 1.5772, + "grad_norm": 3.8176076412200928, + "learning_rate": 0.0002874475356730507 + }, + { + "step": 16, + "epoch": 0.9142857142857143, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125430272, + "loss": 1.3958, + "grad_norm": 1.170166254043579, + "learning_rate": 0.0002841730623343193 + }, + { + "step": 17, + "epoch": 0.9714285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.12543488, + "loss": 1.3998, + "grad_norm": 1.4277585744857788, + "learning_rate": 0.00028054278615452326 + }, + { + "step": 18, + "epoch": 1.0285714285714285, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150623232, + "loss": 2.0323, + "grad_norm": 1.4944947957992554, + "learning_rate": 0.0002765663339734778 + }, + { + "step": 19, + "epoch": 1.0857142857142856, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150621696, + "loss": 1.4314, + "grad_norm": 1.2402381896972656, + "learning_rate": 0.00027225425062752165 + }, + { + "step": 20, + "epoch": 1.1428571428571428, + "cpu_mem": 3.324575744, + "gpu_mem": 1.15059712, + "loss": 1.4471, + "grad_norm": 1.7855156660079956, + "learning_rate": 0.0002676179709865066 + }, + { + "step": 21, + "epoch": 1.2, + "cpu_mem": 3.324575744, + "gpu_mem": 1.1506048, + "loss": 1.3761, + "grad_norm": 0.8922517895698547, + "learning_rate": 0.0002626697896305779 + }, + { + "step": 22, + "epoch": 1.2571428571428571, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150633984, + "loss": 1.3381, + "grad_norm": 0.8572810292243958, + "learning_rate": 0.000257422828247159 + }, + { + "step": 23, + "epoch": 1.3142857142857143, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150663168, + "loss": 1.3171, + "grad_norm": 0.7841963171958923, + "learning_rate": 0.00025189100083459397 + }, + { + "step": 24, + "epoch": 1.3714285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150606336, + "loss": 1.3759, + "grad_norm": 1.24485445022583, + "learning_rate": 0.0002460889768047263 + }, + { + "step": 25, + "epoch": 1.4285714285714286, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150675456, + "loss": 1.3331, + "grad_norm": 0.9976959228515625, + "learning_rate": 0.00024003214208225522 + }, + { + "step": 26, + "epoch": 1.4857142857142858, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150632448, + "loss": 1.325, + "grad_norm": 0.8690944910049438, + "learning_rate": 0.00023373655830402968 + }, + { + "step": 27, + "epoch": 1.5428571428571427, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150590976, + "loss": 1.3396, + "grad_norm": 1.0178576707839966, + "learning_rate": 0.00022721892022647462 + }, + { + "step": 28, + "epoch": 1.6, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150637056, + "loss": 1.5294, + "grad_norm": 3.046597957611084, + "learning_rate": 0.000220496511454098 + }, + { + "step": 29, + "epoch": 1.657142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150632448, + "loss": 1.3595, + "grad_norm": 1.41475248336792, + "learning_rate": 0.0002135871586064791 + }, + { + "step": 30, + "epoch": 1.7142857142857144, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150621696, + "loss": 1.376, + "grad_norm": 1.656673789024353, + "learning_rate": 0.00020650918404527775 + }, + { + "step": 31, + "epoch": 1.7714285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150652416, + "loss": 1.3307, + "grad_norm": 0.7929617166519165, + "learning_rate": 0.00019928135728662522 + }, + { + "step": 32, + "epoch": 1.8285714285714287, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150661632, + "loss": 1.3607, + "grad_norm": 1.1309109926223755, + "learning_rate": 0.00019192284522774142 + }, + { + "step": 33, + "epoch": 1.8857142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150641664, + "loss": 1.4106, + "grad_norm": 2.018371105194092, + "learning_rate": 0.00018445316131976934 + }, + { + "step": 34, + "epoch": 1.9428571428571428, + "cpu_mem": 3.324575744, + "gpu_mem": 1.15062016, + "loss": 1.3654, + "grad_norm": 1.1604561805725098, + "learning_rate": 0.00017689211382161034 + }, + { + "step": 35, + "epoch": 2.0, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150508032, + "loss": 2.1297, + "grad_norm": 1.87057363986969, + "learning_rate": 0.00016925975327198266 + }, + { + "step": 36, + "epoch": 2.057142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125407232, + "loss": 1.3753, + "grad_norm": 1.4637526273727417, + "learning_rate": 0.00016157631931899697 + }, + { + "step": 37, + "epoch": 2.1142857142857143, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125416448, + "loss": 1.3689, + "grad_norm": 1.0966377258300781, + "learning_rate": 0.0001538621870482483 + }, + { + "step": 38, + "epoch": 2.1714285714285713, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125387264, + "loss": 1.3557, + "grad_norm": 0.6691208481788635, + "learning_rate": 0.00014613781295175172 + }, + { + "step": 39, + "epoch": 2.2285714285714286, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125405696, + "loss": 1.3459, + "grad_norm": 0.8344929218292236, + "learning_rate": 0.00013842368068100303 + }, + { + "step": 40, + "epoch": 2.2857142857142856, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125382656, + "loss": 1.3577, + "grad_norm": 0.7070911526679993, + "learning_rate": 0.00013074024672801731 + }, + { + "step": 41, + "epoch": 2.342857142857143, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125384192, + "loss": 1.3646, + "grad_norm": 0.8230006694793701, + "learning_rate": 0.00012310788617838966 + }, + { + "step": 42, + "epoch": 2.4, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125413376, + "loss": 1.3264, + "grad_norm": 1.4941058158874512, + "learning_rate": 0.00011554683868023067 + }, + { + "step": 43, + "epoch": 2.4571428571428573, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125428736, + "loss": 1.3357, + "grad_norm": 0.822471022605896, + "learning_rate": 0.00010807715477225858 + }, + { + "step": 44, + "epoch": 2.5142857142857142, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125447168, + "loss": 1.3194, + "grad_norm": 0.7611081004142761, + "learning_rate": 0.00010071864271337478 + }, + { + "step": 45, + "epoch": 2.571428571428571, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125401088, + "loss": 1.3007, + "grad_norm": 0.5624395608901978, + "learning_rate": 9.34908159547222e-05 + }, + { + "step": 46, + "epoch": 2.6285714285714286, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125394944, + "loss": 1.281, + "grad_norm": 0.6890324354171753, + "learning_rate": 8.641284139352091e-05 + }, + { + "step": 47, + "epoch": 2.685714285714286, + "cpu_mem": 3.324575744, + "gpu_mem": 1.1253888, + "loss": 1.2841, + "grad_norm": 0.7997681498527527, + "learning_rate": 7.950348854590204e-05 + }, + { + "step": 48, + "epoch": 2.742857142857143, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125393408, + "loss": 1.2197, + "grad_norm": 0.7132437229156494, + "learning_rate": 7.278107977352543e-05 + }, + { + "step": 49, + "epoch": 2.8, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125384192, + "loss": 1.2719, + "grad_norm": 0.7102539539337158, + "learning_rate": 6.626344169597031e-05 + }, + { + "step": 50, + "epoch": 2.857142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.12536576, + "loss": 1.3055, + "grad_norm": 0.9197364449501038, + "learning_rate": 5.996785791774478e-05 + }, + { + "step": 51, + "epoch": 2.914285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125390336, + "loss": 1.2877, + "grad_norm": 1.0779134035110474, + "learning_rate": 5.391102319527373e-05 + }, + { + "step": 52, + "epoch": 2.9714285714285715, + "cpu_mem": 3.324575744, + "gpu_mem": 1.125417984, + "loss": 1.3505, + "grad_norm": 1.1818938255310059, + "learning_rate": 4.8108999165406026e-05 + }, + { + "step": 53, + "epoch": 3.0285714285714285, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150618624, + "loss": 1.8794, + "grad_norm": 1.2823582887649536, + "learning_rate": 4.257717175284103e-05 + }, + { + "step": 54, + "epoch": 3.085714285714286, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150587904, + "loss": 1.2942, + "grad_norm": 1.1376744508743286, + "learning_rate": 3.733021036942205e-05 + }, + { + "step": 55, + "epoch": 3.142857142857143, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150621696, + "loss": 1.3538, + "grad_norm": 1.6907755136489868, + "learning_rate": 3.238202901349345e-05 + }, + { + "step": 56, + "epoch": 3.2, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150695424, + "loss": 1.2925, + "grad_norm": 1.1396971940994263, + "learning_rate": 2.774574937247831e-05 + }, + { + "step": 57, + "epoch": 3.257142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150638592, + "loss": 1.2695, + "grad_norm": 0.8919467329978943, + "learning_rate": 2.3433666026522153e-05 + }, + { + "step": 58, + "epoch": 3.314285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150632448, + "loss": 1.1762, + "grad_norm": 1.1649504899978638, + "learning_rate": 1.945721384547671e-05 + }, + { + "step": 59, + "epoch": 3.3714285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150683136, + "loss": 1.2588, + "grad_norm": 1.083436369895935, + "learning_rate": 1.5826937665680693e-05 + }, + { + "step": 60, + "epoch": 3.4285714285714284, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150609408, + "loss": 1.2783, + "grad_norm": 1.4166045188903809, + "learning_rate": 1.2552464326949302e-05 + }, + { + "step": 61, + "epoch": 3.4857142857142858, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150623232, + "loss": 1.3022, + "grad_norm": 1.2695715427398682, + "learning_rate": 9.64247714392597e-06 + }, + { + "step": 62, + "epoch": 3.5428571428571427, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150624768, + "loss": 1.3082, + "grad_norm": 1.2136319875717163, + "learning_rate": 7.104692879492624e-06 + }, + { + "step": 63, + "epoch": 3.6, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150614016, + "loss": 1.2818, + "grad_norm": 1.186843991279602, + "learning_rate": 4.945841281301943e-06 + }, + { + "step": 64, + "epoch": 3.657142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150630912, + "loss": 1.2565, + "grad_norm": 1.2347488403320312, + "learning_rate": 3.1716472356963286e-06 + }, + { + "step": 65, + "epoch": 3.7142857142857144, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150652416, + "loss": 1.2988, + "grad_norm": 1.4303252696990967, + "learning_rate": 1.7868155863384415e-06 + }, + { + "step": 66, + "epoch": 3.7714285714285714, + "cpu_mem": 3.324575744, + "gpu_mem": 1.1506432, + "loss": 1.2359, + "grad_norm": 1.1867451667785645, + "learning_rate": 7.950186578116413e-07 + }, + { + "step": 67, + "epoch": 3.8285714285714287, + "cpu_mem": 3.324575744, + "gpu_mem": 1.150669312, + "loss": 1.2333, + "grad_norm": 1.1337357759475708, + "learning_rate": 1.988865172754206e-07 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.15062016, + "loss": 1.2459, + "grad_norm": 1.3634076118469238, + "learning_rate": 0.0 + }, + { + "step": 68, + "epoch": 3.8857142857142857, + "cpu_mem": 3.324575744, + "gpu_mem": 1.15062016, + "train_runtime": 382.9609, + "train_samples_per_second": 11.688, + "train_steps_per_second": 0.178, + "total_flos": 4019818399604736.0, + "train_loss": 1.527628078180201 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f43ee5d95e6efa86bc12e96d56fbf5a2c265b7 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..30cc83ae75888d4bc957956ed4c0c781daafe129 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.33375420875420875 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..8087da8087f2b0994cd403352c4cd18046b56d1d --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-arc_e-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-arc_e-r2-a2", + "seed": 42, + "timestamp": "2025-08-31T22:30:17.768719" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..1da90b2d800f8a7dd1175e5bb82a0078c1b41b8f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r2-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 3.690397696, + "gpu_mem": 1.055949312, + "loss": 4.5728, + "grad_norm": 17.623332977294922, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 3.690790912, + "gpu_mem": 1.06862592, + "loss": 4.3932, + "grad_norm": 17.91280746459961, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 3.69098752, + "gpu_mem": 1.068604416, + "loss": 4.4887, + "grad_norm": 18.502147674560547, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 3.69098752, + "gpu_mem": 1.068582912, + "loss": 4.362, + "grad_norm": 17.787639617919922, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 3.691184128, + "gpu_mem": 1.068624384, + "loss": 3.8968, + "grad_norm": 18.357763290405273, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 3.691184128, + "gpu_mem": 1.068599808, + "loss": 3.2833, + "grad_norm": 16.60222625732422, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068622848, + "loss": 2.8667, + "grad_norm": 14.6502103805542, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068581376, + "loss": 2.2765, + "grad_norm": 11.139254570007324, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068582912, + "loss": 2.0664, + "grad_norm": 8.033945083618164, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068578304, + "loss": 1.733, + "grad_norm": 3.683929443359375, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06865664, + "loss": 1.6081, + "grad_norm": 5.792934417724609, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068630528, + "loss": 1.5443, + "grad_norm": 3.2337942123413086, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068581376, + "loss": 1.4156, + "grad_norm": 2.4118101596832275, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06860288, + "loss": 1.4122, + "grad_norm": 2.9343068599700928, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06857984, + "loss": 1.3403, + "grad_norm": 2.6774017810821533, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068584448, + "loss": 1.3478, + "grad_norm": 2.118807792663574, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068621312, + "loss": 1.3051, + "grad_norm": 2.7805705070495605, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068632064, + "loss": 1.354, + "grad_norm": 4.082036972045898, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068575232, + "loss": 1.4668, + "grad_norm": 6.4572272300720215, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068645888, + "loss": 1.4388, + "grad_norm": 4.705542087554932, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068644352, + "loss": 1.2743, + "grad_norm": 2.2187483310699463, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068601344, + "loss": 1.3381, + "grad_norm": 2.8660285472869873, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06861824, + "loss": 1.3274, + "grad_norm": 1.4837523698806763, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068575232, + "loss": 1.3416, + "grad_norm": 1.922071099281311, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068604416, + "loss": 1.3801, + "grad_norm": 1.6259715557098389, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068584448, + "loss": 1.4395, + "grad_norm": 1.2233842611312866, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06861056, + "loss": 1.3334, + "grad_norm": 1.0918117761611938, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06861056, + "loss": 1.3798, + "grad_norm": 1.179876685142517, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068589056, + "loss": 1.2622, + "grad_norm": 1.2401257753372192, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06857984, + "loss": 1.3518, + "grad_norm": 1.2181894779205322, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068598272, + "loss": 1.3756, + "grad_norm": 0.9212087988853455, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068621312, + "loss": 1.3391, + "grad_norm": 1.1012150049209595, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06861824, + "loss": 1.3747, + "grad_norm": 1.2176092863082886, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 3.691380736, + "gpu_mem": 1.068621312, + "loss": 1.4545, + "grad_norm": 1.7010281085968018, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 3.691380736, + "gpu_mem": 1.06860288, + "loss": 1.3188, + "grad_norm": 1.0391651391983032, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074901504, + "loss": 1.9444, + "grad_norm": 1.5095938444137573, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074906112, + "loss": 1.334, + "grad_norm": 0.47452786564826965, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074884608, + "loss": 1.2291, + "grad_norm": 0.7163384556770325, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074873856, + "loss": 1.4007, + "grad_norm": 2.18056583404541, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074936832, + "loss": 1.4538, + "grad_norm": 2.1345205307006836, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074896896, + "loss": 1.3325, + "grad_norm": 0.8111973404884338, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074939904, + "loss": 1.3589, + "grad_norm": 0.7804684638977051, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074889216, + "loss": 1.3937, + "grad_norm": 0.9075874090194702, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074953728, + "loss": 1.3525, + "grad_norm": 0.7741839289665222, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074921472, + "loss": 1.3732, + "grad_norm": 0.5592060685157776, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 3.691380736, + "gpu_mem": 1.07492608, + "loss": 1.3733, + "grad_norm": 0.6973922848701477, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 3.691380736, + "gpu_mem": 1.07487232, + "loss": 1.3485, + "grad_norm": 1.4727650880813599, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074886144, + "loss": 1.34, + "grad_norm": 0.7860413789749146, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074875392, + "loss": 1.3358, + "grad_norm": 0.7949798703193665, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074889216, + "loss": 1.3326, + "grad_norm": 0.8648865818977356, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 3.691380736, + "gpu_mem": 1.07494144, + "loss": 1.3538, + "grad_norm": 0.9514224529266357, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074889216, + "loss": 1.3673, + "grad_norm": 0.9149455428123474, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074958336, + "loss": 1.3168, + "grad_norm": 0.801841139793396, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 3.691380736, + "gpu_mem": 1.07492608, + "loss": 1.2903, + "grad_norm": 0.5973301529884338, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074935296, + "loss": 1.3629, + "grad_norm": 0.7240919470787048, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 3.691380736, + "gpu_mem": 1.07491072, + "loss": 1.3243, + "grad_norm": 0.7025162577629089, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074944512, + "loss": 1.3045, + "grad_norm": 0.43494072556495667, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 3.691380736, + "gpu_mem": 1.07492608, + "loss": 1.3074, + "grad_norm": 0.5212063789367676, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074912256, + "loss": 1.3141, + "grad_norm": 0.5083311200141907, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074950656, + "loss": 1.3333, + "grad_norm": 1.1470415592193604, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074883072, + "loss": 1.3049, + "grad_norm": 0.7600314617156982, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074930688, + "loss": 1.3376, + "grad_norm": 0.4235212802886963, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 3.691380736, + "gpu_mem": 1.07488, + "loss": 1.2988, + "grad_norm": 0.6694610714912415, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074929152, + "loss": 1.3327, + "grad_norm": 0.8132529258728027, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 3.691380736, + "gpu_mem": 1.074927616, + "loss": 1.3434, + "grad_norm": 0.6485650539398193, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074946048, + "loss": 1.2963, + "grad_norm": 0.7389944791793823, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07488768, + "loss": 1.317, + "grad_norm": 0.6572802662849426, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074899968, + "loss": 1.3621, + "grad_norm": 0.8535525798797607, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074924544, + "loss": 1.3348, + "grad_norm": 1.3202941417694092, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074901504, + "loss": 1.2963, + "grad_norm": 0.8796724081039429, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074737152, + "loss": 2.1001, + "grad_norm": 3.4859209060668945, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068613632, + "loss": 1.3325, + "grad_norm": 0.7528451085090637, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068576768, + "loss": 1.3041, + "grad_norm": 0.5460439324378967, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068636672, + "loss": 1.2879, + "grad_norm": 0.5270991325378418, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068604416, + "loss": 1.325, + "grad_norm": 0.7511728405952454, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068615168, + "loss": 1.294, + "grad_norm": 0.5017948150634766, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068652032, + "loss": 1.3605, + "grad_norm": 0.45860522985458374, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068636672, + "loss": 1.3503, + "grad_norm": 0.7987976670265198, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 3.6929536, + "gpu_mem": 1.06858752, + "loss": 1.2721, + "grad_norm": 0.9762871861457825, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068632064, + "loss": 1.3477, + "grad_norm": 0.8408863544464111, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 3.6929536, + "gpu_mem": 1.06861824, + "loss": 1.32, + "grad_norm": 1.0601235628128052, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068585984, + "loss": 1.3313, + "grad_norm": 0.8787514567375183, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068636672, + "loss": 1.341, + "grad_norm": 1.5738627910614014, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068575232, + "loss": 1.3325, + "grad_norm": 0.9779291152954102, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068621312, + "loss": 1.3498, + "grad_norm": 1.4073760509490967, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068575232, + "loss": 1.325, + "grad_norm": 0.7961599826812744, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068605952, + "loss": 1.3851, + "grad_norm": 0.7633869647979736, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068581376, + "loss": 1.3293, + "grad_norm": 1.2013071775436401, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068635136, + "loss": 1.2953, + "grad_norm": 0.7941093444824219, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068616704, + "loss": 1.3323, + "grad_norm": 0.9225144982337952, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068566016, + "loss": 1.4241, + "grad_norm": 1.6464204788208008, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068590592, + "loss": 1.2766, + "grad_norm": 0.5622947216033936, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068593664, + "loss": 1.2514, + "grad_norm": 1.0032285451889038, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068585984, + "loss": 1.267, + "grad_norm": 0.6765721440315247, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068624384, + "loss": 1.3152, + "grad_norm": 0.6626193523406982, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 3.6929536, + "gpu_mem": 1.0686336, + "loss": 1.2253, + "grad_norm": 1.2044754028320312, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068576768, + "loss": 1.312, + "grad_norm": 0.7560423612594604, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068576768, + "loss": 1.322, + "grad_norm": 0.6339558959007263, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068573696, + "loss": 1.2762, + "grad_norm": 0.785936176776886, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 3.6929536, + "gpu_mem": 1.06857216, + "loss": 1.2646, + "grad_norm": 0.8913729190826416, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068615168, + "loss": 1.2362, + "grad_norm": 0.9554247856140137, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068553728, + "loss": 1.295, + "grad_norm": 0.6520857810974121, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 3.6929536, + "gpu_mem": 1.06860288, + "loss": 1.2899, + "grad_norm": 0.6042222380638123, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068665856, + "loss": 1.3259, + "grad_norm": 0.9028760194778442, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 3.6929536, + "gpu_mem": 1.06861824, + "loss": 1.2508, + "grad_norm": 0.6963882446289062, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 3.6929536, + "gpu_mem": 1.068599808, + "loss": 1.2909, + "grad_norm": 0.6007523536682129, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074924544, + "loss": 1.7987, + "grad_norm": 1.3940343856811523, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074906112, + "loss": 1.2796, + "grad_norm": 0.8170494437217712, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07489536, + "loss": 1.2919, + "grad_norm": 0.6924269795417786, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07494912, + "loss": 1.3189, + "grad_norm": 1.688995599746704, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074909184, + "loss": 1.2747, + "grad_norm": 0.8169498443603516, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074927616, + "loss": 1.2728, + "grad_norm": 0.6004881858825684, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074990592, + "loss": 1.2833, + "grad_norm": 0.7969038486480713, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 3.6929536, + "gpu_mem": 1.0749184, + "loss": 1.3059, + "grad_norm": 0.7586613893508911, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074912256, + "loss": 1.3185, + "grad_norm": 1.0476021766662598, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074927616, + "loss": 1.2993, + "grad_norm": 0.8736559152603149, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074942976, + "loss": 1.2329, + "grad_norm": 0.8061811923980713, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07493376, + "loss": 1.2847, + "grad_norm": 1.0225528478622437, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074924544, + "loss": 1.3201, + "grad_norm": 0.8458144664764404, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074942976, + "loss": 1.2808, + "grad_norm": 0.7275317907333374, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07494144, + "loss": 1.2662, + "grad_norm": 0.9482367634773254, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074898432, + "loss": 1.246, + "grad_norm": 0.958002507686615, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074930688, + "loss": 1.271, + "grad_norm": 0.8729208111763, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074884608, + "loss": 1.3104, + "grad_norm": 0.6702376008033752, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074929152, + "loss": 1.2681, + "grad_norm": 1.2754486799240112, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07488, + "loss": 1.269, + "grad_norm": 1.1399822235107422, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074892288, + "loss": 1.294, + "grad_norm": 0.7175446152687073, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074916864, + "loss": 1.2766, + "grad_norm": 1.0038007497787476, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074878464, + "loss": 1.2607, + "grad_norm": 0.7497984170913696, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074881536, + "loss": 1.2917, + "grad_norm": 0.7116180062294006, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074893824, + "loss": 1.2595, + "grad_norm": 0.849716305732727, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074858496, + "loss": 1.2608, + "grad_norm": 0.8837090134620667, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074899968, + "loss": 1.2206, + "grad_norm": 0.8048534989356995, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074915328, + "loss": 1.2285, + "grad_norm": 0.9010533094406128, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07488, + "loss": 1.2902, + "grad_norm": 1.1859617233276367, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 3.6929536, + "gpu_mem": 1.07488768, + "loss": 1.3032, + "grad_norm": 0.9159260988235474, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074909184, + "loss": 1.2715, + "grad_norm": 1.2041606903076172, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074919936, + "loss": 1.3311, + "grad_norm": 0.8129523992538452, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074912256, + "loss": 1.3325, + "grad_norm": 0.7621023654937744, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074946048, + "loss": 1.3128, + "grad_norm": 0.8396763205528259, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 3.6929536, + "gpu_mem": 1.074946048, + "train_runtime": 688.8458, + "train_samples_per_second": 13.071, + "train_steps_per_second": 0.203, + "total_flos": 7230061454266368.0, + "train_loss": 1.4852122988019671 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..97cff55d3f03a364161498b7b6299c246238daf5 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..302300608260a98117992f71074b1b0b43fc384c --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.5084175084175084 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..655af647760cae2f1bd77487bf011014bcc7fb55 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-arc_e-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-arc_e-r32-a2", + "seed": 42, + "timestamp": "2025-09-01T12:54:54.340547" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..afb83f73316b3d0cbb424e22886694f6135b3076 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r32-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 3.26907904, + "gpu_mem": 1.150566912, + "loss": 4.5728, + "grad_norm": 71.27687072753906, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 3.269275648, + "gpu_mem": 1.35247872, + "loss": 4.3932, + "grad_norm": 72.56617736816406, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 3.269472256, + "gpu_mem": 1.352457216, + "loss": 3.2174, + "grad_norm": 56.07395553588867, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 3.269668864, + "gpu_mem": 1.352435712, + "loss": 2.2151, + "grad_norm": 20.56838607788086, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 3.269865472, + "gpu_mem": 1.352477184, + "loss": 1.6095, + "grad_norm": 6.020491123199463, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 3.269865472, + "gpu_mem": 1.352452608, + "loss": 1.4682, + "grad_norm": 6.049921989440918, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 3.269865472, + "gpu_mem": 1.352475648, + "loss": 1.4652, + "grad_norm": 5.79309606552124, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 3.269865472, + "gpu_mem": 1.352434176, + "loss": 1.3602, + "grad_norm": 3.346912145614624, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 3.269865472, + "gpu_mem": 1.352435712, + "loss": 1.4145, + "grad_norm": 4.064579486846924, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352431104, + "loss": 1.4897, + "grad_norm": 5.763367176055908, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35250944, + "loss": 1.3287, + "grad_norm": 2.654299020767212, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352483328, + "loss": 1.4183, + "grad_norm": 3.3611836433410645, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352434176, + "loss": 1.3813, + "grad_norm": 2.040464162826538, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35245568, + "loss": 1.3259, + "grad_norm": 1.1107145547866821, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35243264, + "loss": 1.3914, + "grad_norm": 1.9442670345306396, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352437248, + "loss": 1.4105, + "grad_norm": 1.7166483402252197, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352474112, + "loss": 1.3399, + "grad_norm": 1.3153350353240967, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352484864, + "loss": 1.3729, + "grad_norm": 1.899366855621338, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352428032, + "loss": 1.371, + "grad_norm": 1.8430126905441284, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352498688, + "loss": 1.3511, + "grad_norm": 1.1451483964920044, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352497152, + "loss": 1.3461, + "grad_norm": 1.9857527017593384, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352454144, + "loss": 1.3336, + "grad_norm": 1.4646514654159546, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35247104, + "loss": 1.3248, + "grad_norm": 1.5220701694488525, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352428032, + "loss": 1.3607, + "grad_norm": 2.051405668258667, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352457216, + "loss": 1.4035, + "grad_norm": 2.2210841178894043, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352437248, + "loss": 1.4498, + "grad_norm": 1.3466033935546875, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35246336, + "loss": 1.3571, + "grad_norm": 1.4195977449417114, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35246336, + "loss": 1.3929, + "grad_norm": 1.3282390832901, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352441856, + "loss": 1.2964, + "grad_norm": 1.6030638217926025, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35243264, + "loss": 1.3634, + "grad_norm": 1.4350515604019165, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352451072, + "loss": 1.3654, + "grad_norm": 0.8298089504241943, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352474112, + "loss": 1.3412, + "grad_norm": 1.4396945238113403, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35247104, + "loss": 1.3586, + "grad_norm": 1.061161994934082, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352474112, + "loss": 1.3878, + "grad_norm": 1.0271666049957275, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35245568, + "loss": 1.3544, + "grad_norm": 2.4923853874206543, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453371904, + "loss": 1.9735, + "grad_norm": 1.5746839046478271, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453376512, + "loss": 1.3817, + "grad_norm": 2.2626185417175293, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453355008, + "loss": 1.2452, + "grad_norm": 0.8303118348121643, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453344256, + "loss": 1.3451, + "grad_norm": 1.8267189264297485, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453407232, + "loss": 1.3427, + "grad_norm": 0.9114444851875305, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453367296, + "loss": 1.3794, + "grad_norm": 1.8642899990081787, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453410304, + "loss": 1.3891, + "grad_norm": 1.7553538084030151, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453359616, + "loss": 1.4095, + "grad_norm": 1.239801287651062, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453424128, + "loss": 1.3565, + "grad_norm": 1.1079013347625732, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453391872, + "loss": 1.3607, + "grad_norm": 0.8367860317230225, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45339648, + "loss": 1.379, + "grad_norm": 1.6928415298461914, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45334272, + "loss": 1.3828, + "grad_norm": 2.363194465637207, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453356544, + "loss": 1.3605, + "grad_norm": 1.8203710317611694, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453345792, + "loss": 1.3374, + "grad_norm": 1.8188560009002686, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453359616, + "loss": 1.3342, + "grad_norm": 1.6275304555892944, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45341184, + "loss": 1.3683, + "grad_norm": 1.8337191343307495, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453359616, + "loss": 1.4567, + "grad_norm": 2.711681842803955, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453428736, + "loss": 1.3586, + "grad_norm": 1.6432777643203735, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45339648, + "loss": 1.2779, + "grad_norm": 0.8583344221115112, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453405696, + "loss": 1.386, + "grad_norm": 1.3969924449920654, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45338112, + "loss": 1.3231, + "grad_norm": 0.970017671585083, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453414912, + "loss": 1.3082, + "grad_norm": 1.0404213666915894, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45339648, + "loss": 1.3271, + "grad_norm": 0.8812659978866577, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453382656, + "loss": 1.3452, + "grad_norm": 1.3225237131118774, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453421056, + "loss": 1.3588, + "grad_norm": 1.6646146774291992, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453353472, + "loss": 1.3225, + "grad_norm": 1.4376122951507568, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453401088, + "loss": 1.3608, + "grad_norm": 1.0143382549285889, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 3.27006208, + "gpu_mem": 1.4533504, + "loss": 1.3212, + "grad_norm": 0.8607113361358643, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453399552, + "loss": 1.3125, + "grad_norm": 0.8695589900016785, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453398016, + "loss": 1.3737, + "grad_norm": 1.533826470375061, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453416448, + "loss": 1.335, + "grad_norm": 2.1815032958984375, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45335808, + "loss": 1.3485, + "grad_norm": 1.4336656332015991, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453370368, + "loss": 1.3844, + "grad_norm": 1.0832065343856812, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453394944, + "loss": 1.3442, + "grad_norm": 1.3890724182128906, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453371904, + "loss": 1.2736, + "grad_norm": 0.7815632820129395, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453207552, + "loss": 1.9169, + "grad_norm": 1.5413181781768799, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352466432, + "loss": 1.4119, + "grad_norm": 1.4770418405532837, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352429568, + "loss": 1.3283, + "grad_norm": 1.1221708059310913, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352489472, + "loss": 1.3172, + "grad_norm": 0.9554754495620728, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352457216, + "loss": 1.347, + "grad_norm": 1.1061042547225952, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352467968, + "loss": 1.2626, + "grad_norm": 0.7214590907096863, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352504832, + "loss": 1.3259, + "grad_norm": 0.7056289911270142, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352489472, + "loss": 1.3493, + "grad_norm": 1.0354070663452148, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35244032, + "loss": 1.258, + "grad_norm": 0.9498522877693176, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352484864, + "loss": 1.2917, + "grad_norm": 0.9291858077049255, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35247104, + "loss": 1.3368, + "grad_norm": 1.260483980178833, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352438784, + "loss": 1.3207, + "grad_norm": 1.452541708946228, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352489472, + "loss": 1.3498, + "grad_norm": 2.128232717514038, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352428032, + "loss": 1.3294, + "grad_norm": 1.1398086547851562, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352474112, + "loss": 1.3717, + "grad_norm": 1.8851423263549805, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352428032, + "loss": 1.2964, + "grad_norm": 1.1609834432601929, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352458752, + "loss": 1.303, + "grad_norm": 1.2602580785751343, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352434176, + "loss": 1.2519, + "grad_norm": 1.6833473443984985, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352487936, + "loss": 1.2606, + "grad_norm": 1.2176685333251953, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352469504, + "loss": 1.3407, + "grad_norm": 2.517871618270874, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352418816, + "loss": 1.3603, + "grad_norm": 2.3676059246063232, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352443392, + "loss": 1.3448, + "grad_norm": 2.3296971321105957, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352446464, + "loss": 1.2667, + "grad_norm": 2.365894079208374, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352438784, + "loss": 1.2468, + "grad_norm": 1.2305500507354736, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352477184, + "loss": 1.2776, + "grad_norm": 2.337395668029785, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 3.27006208, + "gpu_mem": 1.3524864, + "loss": 1.1774, + "grad_norm": 1.479711890220642, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352429568, + "loss": 1.2827, + "grad_norm": 1.602455496788025, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352429568, + "loss": 1.2667, + "grad_norm": 1.5519955158233643, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352426496, + "loss": 1.2624, + "grad_norm": 1.387523889541626, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35242496, + "loss": 1.229, + "grad_norm": 1.3637725114822388, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352467968, + "loss": 1.1472, + "grad_norm": 1.1382238864898682, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352406528, + "loss": 1.307, + "grad_norm": 1.531333565711975, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35245568, + "loss": 1.254, + "grad_norm": 1.094638705253601, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352518656, + "loss": 1.3433, + "grad_norm": 1.785682201385498, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 3.27006208, + "gpu_mem": 1.35247104, + "loss": 1.2021, + "grad_norm": 1.1777366399765015, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 3.27006208, + "gpu_mem": 1.352452608, + "loss": 1.2377, + "grad_norm": 1.4150007963180542, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453394944, + "loss": 1.7096, + "grad_norm": 2.5708060264587402, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453376512, + "loss": 1.183, + "grad_norm": 1.329061508178711, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45336576, + "loss": 1.1843, + "grad_norm": 1.2743369340896606, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45341952, + "loss": 1.1861, + "grad_norm": 1.9914251565933228, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453379584, + "loss": 1.2021, + "grad_norm": 1.615986943244934, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453398016, + "loss": 1.1381, + "grad_norm": 1.7984881401062012, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453460992, + "loss": 1.1436, + "grad_norm": 2.490532398223877, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 3.27006208, + "gpu_mem": 1.4533888, + "loss": 1.2004, + "grad_norm": 2.108456611633301, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453382656, + "loss": 1.2498, + "grad_norm": 2.1600825786590576, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453398016, + "loss": 1.2116, + "grad_norm": 1.9217031002044678, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453413376, + "loss": 1.1837, + "grad_norm": 2.250230073928833, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45340416, + "loss": 1.1194, + "grad_norm": 1.6285483837127686, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453394944, + "loss": 1.1972, + "grad_norm": 1.6395460367202759, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453413376, + "loss": 1.1681, + "grad_norm": 1.287441372871399, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45341184, + "loss": 1.1206, + "grad_norm": 1.4543119668960571, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453368832, + "loss": 1.0764, + "grad_norm": 1.5564324855804443, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453401088, + "loss": 1.0965, + "grad_norm": 1.8127130270004272, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453355008, + "loss": 1.1825, + "grad_norm": 2.2149300575256348, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453399552, + "loss": 1.0716, + "grad_norm": 1.8996946811676025, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 3.27006208, + "gpu_mem": 1.4533504, + "loss": 1.1211, + "grad_norm": 1.6778340339660645, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453362688, + "loss": 1.1325, + "grad_norm": 2.1428651809692383, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453387264, + "loss": 1.1163, + "grad_norm": 1.701382040977478, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453348864, + "loss": 1.0902, + "grad_norm": 1.848825454711914, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453351936, + "loss": 1.1636, + "grad_norm": 2.0770599842071533, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453364224, + "loss": 1.0999, + "grad_norm": 2.2524313926696777, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453328896, + "loss": 1.116, + "grad_norm": 1.5813498497009277, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453370368, + "loss": 1.1683, + "grad_norm": 1.7459475994110107, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453385728, + "loss": 1.0949, + "grad_norm": 1.6756196022033691, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 3.27006208, + "gpu_mem": 1.4533504, + "loss": 1.1767, + "grad_norm": 2.3433804512023926, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 3.27006208, + "gpu_mem": 1.45335808, + "loss": 1.1629, + "grad_norm": 1.9952489137649536, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453379584, + "loss": 1.0831, + "grad_norm": 2.3805150985717773, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453390336, + "loss": 1.1271, + "grad_norm": 1.6594616174697876, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453382656, + "loss": 1.1998, + "grad_norm": 2.0030813217163086, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453416448, + "loss": 1.1628, + "grad_norm": 2.2832202911376953, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 3.27006208, + "gpu_mem": 1.453416448, + "train_runtime": 697.2892, + "train_samples_per_second": 12.913, + "train_steps_per_second": 0.201, + "total_flos": 7395127108743168.0, + "train_loss": 1.3715088069438934 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a34e999804ff05ab393ed2117c936e4d7827f88f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a28baa8d53570525344657bb12fd77d3f24ae3bb --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "arc_e", + "results": 0.5 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..768fa4c476830e59b6bf36054e48b7ae7035792e --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "ARC_E", + "dataset_id": "allenai/ai2_arc", + "preprocess_id": "arc_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-arc_e-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-arc_e-r8-a2", + "seed": 42, + "timestamp": "2025-09-01T05:43:37.108073" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..b2b85e941b963c030093447ad8845fd01f14dcfb --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-arc_e-r8-a2/training_logs.json @@ -0,0 +1,1273 @@ +[ + { + "step": 1, + "epoch": 0.028169014084507043, + "cpu_mem": 3.333206016, + "gpu_mem": 1.074872832, + "loss": 4.5728, + "grad_norm": 33.86262130737305, + "learning_rate": 2.1428571428571425e-05 + }, + { + "step": 2, + "epoch": 0.056338028169014086, + "cpu_mem": 3.333599232, + "gpu_mem": 1.12539648, + "loss": 4.3932, + "grad_norm": 34.76551818847656, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 3, + "epoch": 0.08450704225352113, + "cpu_mem": 3.33379584, + "gpu_mem": 1.125374976, + "loss": 4.2086, + "grad_norm": 35.19297409057617, + "learning_rate": 6.428571428571427e-05 + }, + { + "step": 4, + "epoch": 0.11267605633802817, + "cpu_mem": 3.33379584, + "gpu_mem": 1.125353472, + "loss": 3.5972, + "grad_norm": 29.389989852905273, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 5, + "epoch": 0.14084507042253522, + "cpu_mem": 3.33379584, + "gpu_mem": 1.125394944, + "loss": 2.5543, + "grad_norm": 18.877796173095703, + "learning_rate": 0.00010714285714285714 + }, + { + "step": 6, + "epoch": 0.16901408450704225, + "cpu_mem": 3.333992448, + "gpu_mem": 1.125370368, + "loss": 1.9716, + "grad_norm": 8.926615715026855, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 7, + "epoch": 0.19718309859154928, + "cpu_mem": 3.333992448, + "gpu_mem": 1.125393408, + "loss": 1.7154, + "grad_norm": 6.924084663391113, + "learning_rate": 0.00015 + }, + { + "step": 8, + "epoch": 0.22535211267605634, + "cpu_mem": 3.333992448, + "gpu_mem": 1.125351936, + "loss": 1.5198, + "grad_norm": 3.222949981689453, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 9, + "epoch": 0.2535211267605634, + "cpu_mem": 3.333992448, + "gpu_mem": 1.125353472, + "loss": 1.4527, + "grad_norm": 3.850663661956787, + "learning_rate": 0.00019285714285714286 + }, + { + "step": 10, + "epoch": 0.28169014084507044, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125348864, + "loss": 1.5042, + "grad_norm": 6.100240707397461, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 11, + "epoch": 0.30985915492957744, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1254272, + "loss": 1.3889, + "grad_norm": 3.210764169692993, + "learning_rate": 0.00023571428571428569 + }, + { + "step": 12, + "epoch": 0.3380281690140845, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125401088, + "loss": 1.3707, + "grad_norm": 3.3460819721221924, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 13, + "epoch": 0.36619718309859156, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125351936, + "loss": 1.4508, + "grad_norm": 6.236752986907959, + "learning_rate": 0.00027857142857142854 + }, + { + "step": 14, + "epoch": 0.39436619718309857, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12537344, + "loss": 1.3355, + "grad_norm": 1.7698322534561157, + "learning_rate": 0.0003 + }, + { + "step": 15, + "epoch": 0.4225352112676056, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1253504, + "loss": 1.3495, + "grad_norm": 2.274522542953491, + "learning_rate": 0.0002999533773001224 + }, + { + "step": 16, + "epoch": 0.4507042253521127, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125355008, + "loss": 1.4146, + "grad_norm": 2.608646869659424, + "learning_rate": 0.0002998135381828383 + }, + { + "step": 17, + "epoch": 0.4788732394366197, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125391872, + "loss": 1.3024, + "grad_norm": 1.1738522052764893, + "learning_rate": 0.00029958056957717696 + }, + { + "step": 18, + "epoch": 0.5070422535211268, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125402624, + "loss": 1.3742, + "grad_norm": 2.0573129653930664, + "learning_rate": 0.0002992546163048102 + }, + { + "step": 19, + "epoch": 0.5352112676056338, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125345792, + "loss": 1.3784, + "grad_norm": 2.0514602661132812, + "learning_rate": 0.0002988358809900258 + }, + { + "step": 20, + "epoch": 0.5633802816901409, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125416448, + "loss": 1.3576, + "grad_norm": 0.9641831517219543, + "learning_rate": 0.0002983246239337692 + }, + { + "step": 21, + "epoch": 0.5915492957746479, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125414912, + "loss": 1.3405, + "grad_norm": 1.298500657081604, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 22, + "epoch": 0.6197183098591549, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125371904, + "loss": 1.3609, + "grad_norm": 1.4783939123153687, + "learning_rate": 0.00029702587317728153 + }, + { + "step": 23, + "epoch": 0.647887323943662, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1253888, + "loss": 1.33, + "grad_norm": 0.8664279580116272, + "learning_rate": 0.0002962391868272735 + }, + { + "step": 24, + "epoch": 0.676056338028169, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125345792, + "loss": 1.3233, + "grad_norm": 0.892545759677887, + "learning_rate": 0.00029536159293436166 + }, + { + "step": 25, + "epoch": 0.704225352112676, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125374976, + "loss": 1.4139, + "grad_norm": 1.4401839971542358, + "learning_rate": 0.00029439363704250176 + }, + { + "step": 26, + "epoch": 0.7323943661971831, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125355008, + "loss": 1.4747, + "grad_norm": 1.5091224908828735, + "learning_rate": 0.00029333592086792107 + }, + { + "step": 27, + "epoch": 0.7605633802816901, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12538112, + "loss": 1.3267, + "grad_norm": 0.6856065392494202, + "learning_rate": 0.0002921891019250697 + }, + { + "step": 28, + "epoch": 0.7887323943661971, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12538112, + "loss": 1.3808, + "grad_norm": 1.0650168657302856, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 29, + "epoch": 0.8169014084507042, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125359616, + "loss": 1.2593, + "grad_norm": 0.8949085474014282, + "learning_rate": 0.00028963106229663063 + }, + { + "step": 30, + "epoch": 0.8450704225352113, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1253504, + "loss": 1.3367, + "grad_norm": 0.6276845932006836, + "learning_rate": 0.00028822143178056114 + }, + { + "step": 31, + "epoch": 0.8732394366197183, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125368832, + "loss": 1.3754, + "grad_norm": 0.95515376329422, + "learning_rate": 0.00028672587784675096 + }, + { + "step": 32, + "epoch": 0.9014084507042254, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125391872, + "loss": 1.342, + "grad_norm": 1.117200255393982, + "learning_rate": 0.0002851453301853628 + }, + { + "step": 33, + "epoch": 0.9295774647887324, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1253888, + "loss": 1.367, + "grad_norm": 0.7321996688842773, + "learning_rate": 0.00028348077132172027 + }, + { + "step": 34, + "epoch": 0.9577464788732394, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125391872, + "loss": 1.43, + "grad_norm": 1.1582528352737427, + "learning_rate": 0.0002817332360055343 + }, + { + "step": 35, + "epoch": 0.9859154929577465, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12537344, + "loss": 1.3197, + "grad_norm": 0.7974072098731995, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 36, + "epoch": 1.0140845070422535, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150595584, + "loss": 1.9673, + "grad_norm": 1.3636599779129028, + "learning_rate": 0.0002779936322448233 + }, + { + "step": 37, + "epoch": 1.0422535211267605, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150600192, + "loss": 1.3228, + "grad_norm": 0.5033758282661438, + "learning_rate": 0.0002760038884726157 + }, + { + "step": 38, + "epoch": 1.0704225352112675, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150578688, + "loss": 1.2339, + "grad_norm": 0.7470241785049438, + "learning_rate": 0.00027393581614739923 + }, + { + "step": 39, + "epoch": 1.0985915492957747, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150567936, + "loss": 1.4041, + "grad_norm": 1.8452550172805786, + "learning_rate": 0.0002717907008573785 + }, + { + "step": 40, + "epoch": 1.1267605633802817, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150630912, + "loss": 1.4342, + "grad_norm": 1.6611576080322266, + "learning_rate": 0.0002695698760834384 + }, + { + "step": 41, + "epoch": 1.1549295774647887, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150590976, + "loss": 1.3064, + "grad_norm": 0.7073245644569397, + "learning_rate": 0.00026727472237020447 + }, + { + "step": 42, + "epoch": 1.1830985915492958, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150633984, + "loss": 1.3456, + "grad_norm": 0.6292695999145508, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 43, + "epoch": 1.2112676056338028, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150583296, + "loss": 1.387, + "grad_norm": 0.5566242933273315, + "learning_rate": 0.0002624671804451601 + }, + { + "step": 44, + "epoch": 1.2394366197183098, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150647808, + "loss": 1.3418, + "grad_norm": 1.0090694427490234, + "learning_rate": 0.0002599577807744739 + }, + { + "step": 45, + "epoch": 1.267605633802817, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150615552, + "loss": 1.3702, + "grad_norm": 0.8750900030136108, + "learning_rate": 0.0002573800273889577 + }, + { + "step": 46, + "epoch": 1.295774647887324, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15062016, + "loss": 1.3806, + "grad_norm": 1.0443229675292969, + "learning_rate": 0.0002547355227129109 + }, + { + "step": 47, + "epoch": 1.323943661971831, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1505664, + "loss": 1.3419, + "grad_norm": 1.4716330766677856, + "learning_rate": 0.00025202591066563786 + }, + { + "step": 48, + "epoch": 1.352112676056338, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150580224, + "loss": 1.3217, + "grad_norm": 0.9559872150421143, + "learning_rate": 0.0002492528756395289 + }, + { + "step": 49, + "epoch": 1.380281690140845, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150569472, + "loss": 1.3032, + "grad_norm": 0.5994803309440613, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 50, + "epoch": 1.408450704225352, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150583296, + "loss": 1.3303, + "grad_norm": 0.7726380825042725, + "learning_rate": 0.00024352347027881003 + }, + { + "step": 51, + "epoch": 1.436619718309859, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15063552, + "loss": 1.373, + "grad_norm": 1.0797308683395386, + "learning_rate": 0.0002405706615488216 + }, + { + "step": 52, + "epoch": 1.4647887323943662, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150583296, + "loss": 1.3526, + "grad_norm": 1.2443790435791016, + "learning_rate": 0.00023756155083521846 + }, + { + "step": 53, + "epoch": 1.4929577464788732, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150652416, + "loss": 1.3197, + "grad_norm": 0.8344854712486267, + "learning_rate": 0.00023449800870954326 + }, + { + "step": 54, + "epoch": 1.5211267605633803, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15062016, + "loss": 1.3164, + "grad_norm": 1.25892174243927, + "learning_rate": 0.0002313819395798639 + }, + { + "step": 55, + "epoch": 1.5492957746478875, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150629376, + "loss": 1.3408, + "grad_norm": 0.7773955464363098, + "learning_rate": 0.0002282152805069247 + }, + { + "step": 56, + "epoch": 1.5774647887323945, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1506048, + "loss": 1.3158, + "grad_norm": 0.8528761863708496, + "learning_rate": 0.000225 + }, + { + "step": 57, + "epoch": 1.6056338028169015, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150638592, + "loss": 1.2882, + "grad_norm": 0.6971151828765869, + "learning_rate": 0.00022173809679319772 + }, + { + "step": 58, + "epoch": 1.6338028169014085, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15062016, + "loss": 1.3042, + "grad_norm": 0.7086783647537231, + "learning_rate": 0.00021843159860297442 + }, + { + "step": 59, + "epoch": 1.6619718309859155, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150606336, + "loss": 1.3164, + "grad_norm": 0.7261967062950134, + "learning_rate": 0.00021508256086763368 + }, + { + "step": 60, + "epoch": 1.6901408450704225, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150644736, + "loss": 1.3021, + "grad_norm": 0.9239888787269592, + "learning_rate": 0.00021169306546959174 + }, + { + "step": 61, + "epoch": 1.7183098591549295, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150577152, + "loss": 1.3107, + "grad_norm": 1.1465481519699097, + "learning_rate": 0.0002082652194412042 + }, + { + "step": 62, + "epoch": 1.7464788732394365, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150624768, + "loss": 1.3599, + "grad_norm": 1.096864104270935, + "learning_rate": 0.00020480115365495926 + }, + { + "step": 63, + "epoch": 1.7746478873239435, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15057408, + "loss": 1.3083, + "grad_norm": 1.0841001272201538, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 64, + "epoch": 1.8028169014084507, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150623232, + "loss": 1.3176, + "grad_norm": 0.9216986894607544, + "learning_rate": 0.00019777299753775265 + }, + { + "step": 65, + "epoch": 1.8309859154929577, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150621696, + "loss": 1.3449, + "grad_norm": 1.130928635597229, + "learning_rate": 0.00019421327616163563 + }, + { + "step": 66, + "epoch": 1.8591549295774648, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150640128, + "loss": 1.3343, + "grad_norm": 1.2934296131134033, + "learning_rate": 0.00019062607022145078 + }, + { + "step": 67, + "epoch": 1.887323943661972, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15058176, + "loss": 1.3294, + "grad_norm": 1.088483214378357, + "learning_rate": 0.00018701360965354402 + }, + { + "step": 68, + "epoch": 1.915492957746479, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150594048, + "loss": 1.3535, + "grad_norm": 0.8583921790122986, + "learning_rate": 0.00018337814009344714 + }, + { + "step": 69, + "epoch": 1.943661971830986, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150618624, + "loss": 1.2933, + "grad_norm": 0.9922156929969788, + "learning_rate": 0.0001797219214799096 + }, + { + "step": 70, + "epoch": 1.971830985915493, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150595584, + "loss": 1.2791, + "grad_norm": 0.6503699421882629, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 71, + "epoch": 2.0, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150431232, + "loss": 2.0151, + "grad_norm": 2.5221052169799805, + "learning_rate": 0.00017235633992642615 + }, + { + "step": 72, + "epoch": 2.028169014084507, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125384192, + "loss": 1.3478, + "grad_norm": 0.9867616891860962, + "learning_rate": 0.00016865155569712278 + }, + { + "step": 73, + "epoch": 2.056338028169014, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125347328, + "loss": 1.2886, + "grad_norm": 0.5598757266998291, + "learning_rate": 0.0001649351769893725 + }, + { + "step": 74, + "epoch": 2.084507042253521, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125407232, + "loss": 1.2822, + "grad_norm": 0.5943240523338318, + "learning_rate": 0.00016120951403796364 + }, + { + "step": 75, + "epoch": 2.112676056338028, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125374976, + "loss": 1.3177, + "grad_norm": 0.7274808287620544, + "learning_rate": 0.00015747688284910457 + }, + { + "step": 76, + "epoch": 2.140845070422535, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125385728, + "loss": 1.2688, + "grad_norm": 0.5879124402999878, + "learning_rate": 0.00015373960376071093 + }, + { + "step": 77, + "epoch": 2.169014084507042, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125422592, + "loss": 1.3351, + "grad_norm": 0.5320140719413757, + "learning_rate": 0.00015 + }, + { + "step": 78, + "epoch": 2.1971830985915495, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125407232, + "loss": 1.3148, + "grad_norm": 0.6910581588745117, + "learning_rate": 0.00014626039623928907 + }, + { + "step": 79, + "epoch": 2.2253521126760565, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12535808, + "loss": 1.2288, + "grad_norm": 0.8275500535964966, + "learning_rate": 0.0001425231171508954 + }, + { + "step": 80, + "epoch": 2.2535211267605635, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125402624, + "loss": 1.3152, + "grad_norm": 1.193676233291626, + "learning_rate": 0.00013879048596203636 + }, + { + "step": 81, + "epoch": 2.2816901408450705, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1253888, + "loss": 1.3016, + "grad_norm": 1.1068768501281738, + "learning_rate": 0.0001350648230106275 + }, + { + "step": 82, + "epoch": 2.3098591549295775, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125356544, + "loss": 1.3027, + "grad_norm": 1.2004145383834839, + "learning_rate": 0.00013134844430287725 + }, + { + "step": 83, + "epoch": 2.3380281690140845, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125407232, + "loss": 1.2703, + "grad_norm": 1.495643138885498, + "learning_rate": 0.0001276436600735738 + }, + { + "step": 84, + "epoch": 2.3661971830985915, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125345792, + "loss": 1.3233, + "grad_norm": 1.5472499132156372, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 85, + "epoch": 2.3943661971830985, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125391872, + "loss": 1.3059, + "grad_norm": 1.799462914466858, + "learning_rate": 0.00012027807852009038 + }, + { + "step": 86, + "epoch": 2.4225352112676055, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125345792, + "loss": 1.2916, + "grad_norm": 1.0714576244354248, + "learning_rate": 0.00011662185990655284 + }, + { + "step": 87, + "epoch": 2.4507042253521125, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125376512, + "loss": 1.3209, + "grad_norm": 1.2693254947662354, + "learning_rate": 0.00011298639034645593 + }, + { + "step": 88, + "epoch": 2.4788732394366195, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125351936, + "loss": 1.2829, + "grad_norm": 1.5111981630325317, + "learning_rate": 0.00010937392977854923 + }, + { + "step": 89, + "epoch": 2.507042253521127, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125405696, + "loss": 1.2294, + "grad_norm": 1.5623971223831177, + "learning_rate": 0.00010578672383836435 + }, + { + "step": 90, + "epoch": 2.535211267605634, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125387264, + "loss": 1.3091, + "grad_norm": 1.9625316858291626, + "learning_rate": 0.00010222700246224735 + }, + { + "step": 91, + "epoch": 2.563380281690141, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125336576, + "loss": 1.3064, + "grad_norm": 2.2262487411499023, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 92, + "epoch": 2.591549295774648, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125361152, + "loss": 1.2509, + "grad_norm": 1.480814814567566, + "learning_rate": 9.519884634504074e-05 + }, + { + "step": 93, + "epoch": 2.619718309859155, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125364224, + "loss": 1.2138, + "grad_norm": 1.6825463771820068, + "learning_rate": 9.17347805587958e-05 + }, + { + "step": 94, + "epoch": 2.647887323943662, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125356544, + "loss": 1.156, + "grad_norm": 1.9236863851547241, + "learning_rate": 8.830693453040829e-05 + }, + { + "step": 95, + "epoch": 2.676056338028169, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125394944, + "loss": 1.272, + "grad_norm": 1.8665015697479248, + "learning_rate": 8.491743913236628e-05 + }, + { + "step": 96, + "epoch": 2.704225352112676, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12540416, + "loss": 1.125, + "grad_norm": 1.7548712491989136, + "learning_rate": 8.156840139702554e-05 + }, + { + "step": 97, + "epoch": 2.732394366197183, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125347328, + "loss": 1.2273, + "grad_norm": 1.908643126487732, + "learning_rate": 7.82619032068023e-05 + }, + { + "step": 98, + "epoch": 2.76056338028169, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125347328, + "loss": 1.2463, + "grad_norm": 1.6651192903518677, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 99, + "epoch": 2.788732394366197, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125344256, + "loss": 1.2088, + "grad_norm": 1.7607053518295288, + "learning_rate": 7.17847194930753e-05 + }, + { + "step": 100, + "epoch": 2.816901408450704, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12534272, + "loss": 1.1507, + "grad_norm": 1.8790853023529053, + "learning_rate": 6.86180604201361e-05 + }, + { + "step": 101, + "epoch": 2.845070422535211, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125385728, + "loss": 1.0778, + "grad_norm": 1.9931615591049194, + "learning_rate": 6.550199129045668e-05 + }, + { + "step": 102, + "epoch": 2.873239436619718, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125324288, + "loss": 1.2881, + "grad_norm": 2.6105551719665527, + "learning_rate": 6.243844916478155e-05 + }, + { + "step": 103, + "epoch": 2.9014084507042255, + "cpu_mem": 3.334189056, + "gpu_mem": 1.12537344, + "loss": 1.2162, + "grad_norm": 1.9779986143112183, + "learning_rate": 5.9429338451178355e-05 + }, + { + "step": 104, + "epoch": 2.9295774647887325, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125436416, + "loss": 1.367, + "grad_norm": 3.5651304721832275, + "learning_rate": 5.6476529721189974e-05 + }, + { + "step": 105, + "epoch": 2.9577464788732395, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1253888, + "loss": 1.214, + "grad_norm": 2.317695140838623, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 106, + "epoch": 2.9859154929577465, + "cpu_mem": 3.334189056, + "gpu_mem": 1.125370368, + "loss": 1.2387, + "grad_norm": 2.3056371212005615, + "learning_rate": 5.074712436047112e-05 + }, + { + "step": 107, + "epoch": 3.0140845070422535, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150618624, + "loss": 1.5299, + "grad_norm": 4.328421592712402, + "learning_rate": 4.7974089334362057e-05 + }, + { + "step": 108, + "epoch": 3.0422535211267605, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150600192, + "loss": 1.1701, + "grad_norm": 2.2429699897766113, + "learning_rate": 4.526447728708908e-05 + }, + { + "step": 109, + "epoch": 3.0704225352112675, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15058944, + "loss": 1.1194, + "grad_norm": 2.1113884449005127, + "learning_rate": 4.261997261104223e-05 + }, + { + "step": 110, + "epoch": 3.0985915492957745, + "cpu_mem": 3.334189056, + "gpu_mem": 1.1506432, + "loss": 1.1748, + "grad_norm": 4.475000381469727, + "learning_rate": 4.004221922552608e-05 + }, + { + "step": 111, + "epoch": 3.1267605633802815, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150603264, + "loss": 1.1095, + "grad_norm": 2.2468886375427246, + "learning_rate": 3.753281955483985e-05 + }, + { + "step": 112, + "epoch": 3.1549295774647885, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150621696, + "loss": 1.108, + "grad_norm": 2.263409376144409, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 113, + "epoch": 3.183098591549296, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150684672, + "loss": 1.1073, + "grad_norm": 2.53086519241333, + "learning_rate": 3.2725277629795526e-05 + }, + { + "step": 114, + "epoch": 3.211267605633803, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15061248, + "loss": 1.1124, + "grad_norm": 2.7708022594451904, + "learning_rate": 3.0430123916561672e-05 + }, + { + "step": 115, + "epoch": 3.23943661971831, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150606336, + "loss": 1.1458, + "grad_norm": 2.7742066383361816, + "learning_rate": 2.8209299142621522e-05 + }, + { + "step": 116, + "epoch": 3.267605633802817, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150621696, + "loss": 1.0707, + "grad_norm": 2.7080955505371094, + "learning_rate": 2.6064183852600797e-05 + }, + { + "step": 117, + "epoch": 3.295774647887324, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150637056, + "loss": 1.1054, + "grad_norm": 2.755727767944336, + "learning_rate": 2.3996111527384288e-05 + }, + { + "step": 118, + "epoch": 3.323943661971831, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15062784, + "loss": 1.1134, + "grad_norm": 3.775683641433716, + "learning_rate": 2.2006367755176655e-05 + }, + { + "step": 119, + "epoch": 3.352112676056338, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150618624, + "loss": 1.1463, + "grad_norm": 3.10908842086792, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 120, + "epoch": 3.380281690140845, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150637056, + "loss": 1.1271, + "grad_norm": 2.657003879547119, + "learning_rate": 1.82667639944657e-05 + }, + { + "step": 121, + "epoch": 3.408450704225352, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15063552, + "loss": 1.0425, + "grad_norm": 3.135737895965576, + "learning_rate": 1.6519228678279718e-05 + }, + { + "step": 122, + "epoch": 3.436619718309859, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150592512, + "loss": 0.9249, + "grad_norm": 2.552415132522583, + "learning_rate": 1.4854669814637143e-05 + }, + { + "step": 123, + "epoch": 3.464788732394366, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150624768, + "loss": 1.0657, + "grad_norm": 3.6429359912872314, + "learning_rate": 1.3274122153249028e-05 + }, + { + "step": 124, + "epoch": 3.492957746478873, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150578688, + "loss": 1.1765, + "grad_norm": 2.835240364074707, + "learning_rate": 1.1778568219438839e-05 + }, + { + "step": 125, + "epoch": 3.52112676056338, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150623232, + "loss": 1.0289, + "grad_norm": 3.3008811473846436, + "learning_rate": 1.036893770336938e-05 + }, + { + "step": 126, + "epoch": 3.5492957746478875, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15057408, + "loss": 1.0689, + "grad_norm": 3.5186192989349365, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 127, + "epoch": 3.5774647887323945, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150586368, + "loss": 1.1238, + "grad_norm": 3.5988266468048096, + "learning_rate": 7.810898074930243e-06 + }, + { + "step": 128, + "epoch": 3.6056338028169015, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150610944, + "loss": 1.0844, + "grad_norm": 3.300708532333374, + "learning_rate": 6.664079132078881e-06 + }, + { + "step": 129, + "epoch": 3.6338028169014085, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150572544, + "loss": 0.9792, + "grad_norm": 4.237852573394775, + "learning_rate": 5.606362957498195e-06 + }, + { + "step": 130, + "epoch": 3.6619718309859155, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150575616, + "loss": 1.0774, + "grad_norm": 3.611595869064331, + "learning_rate": 4.638407065638322e-06 + }, + { + "step": 131, + "epoch": 3.6901408450704225, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150587904, + "loss": 0.9478, + "grad_norm": 3.071817636489868, + "learning_rate": 3.760813172726457e-06 + }, + { + "step": 132, + "epoch": 3.7183098591549295, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150552576, + "loss": 1.0596, + "grad_norm": 3.188192844390869, + "learning_rate": 2.9741268227184255e-06 + }, + { + "step": 133, + "epoch": 3.7464788732394365, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150594048, + "loss": 1.1108, + "grad_norm": 3.317403554916382, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 134, + "epoch": 3.7746478873239435, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150609408, + "loss": 1.0055, + "grad_norm": 3.0370452404022217, + "learning_rate": 1.6753760662307215e-06 + }, + { + "step": 135, + "epoch": 3.802816901408451, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15057408, + "loss": 1.0392, + "grad_norm": 3.547057628631592, + "learning_rate": 1.1641190099741904e-06 + }, + { + "step": 136, + "epoch": 3.830985915492958, + "cpu_mem": 3.334189056, + "gpu_mem": 1.15058176, + "loss": 1.1056, + "grad_norm": 4.652339458465576, + "learning_rate": 7.453836951897885e-07 + }, + { + "step": 137, + "epoch": 3.859154929577465, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150603264, + "loss": 1.0659, + "grad_norm": 3.707662582397461, + "learning_rate": 4.194304228229806e-07 + }, + { + "step": 138, + "epoch": 3.887323943661972, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150614016, + "loss": 1.1002, + "grad_norm": 3.0242950916290283, + "learning_rate": 1.8646181716164831e-07 + }, + { + "step": 139, + "epoch": 3.915492957746479, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150606336, + "loss": 1.1242, + "grad_norm": 3.7408151626586914, + "learning_rate": 4.662269987756317e-08 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150640128, + "loss": 1.0853, + "grad_norm": 3.439875364303589, + "learning_rate": 0.0 + }, + { + "step": 140, + "epoch": 3.943661971830986, + "cpu_mem": 3.334189056, + "gpu_mem": 1.150640128, + "train_runtime": 689.8196, + "train_samples_per_second": 13.053, + "train_steps_per_second": 0.203, + "total_flos": 7263074585161728.0, + "train_loss": 1.374368126477514 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f43ee5d95e6efa86bc12e96d56fbf5a2c265b7 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7dce9db737db09a248c7ab4f42d94ed2fd4ab48a --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.7596330275229358 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..347e600777487bfae1ec44b1a040e1ef2230ba94 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-boolq-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-boolq-r2-a2", + "seed": 42, + "timestamp": "2025-08-31T18:40:29.248455" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..3b014fc35e2b5873ea7fc620b5b4fa60a62fae12 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r2-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 2.910605312, + "gpu_mem": 1.03901184, + "loss": 8.7378, + "grad_norm": 15.271435737609863, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 2.992316416, + "gpu_mem": 1.068804608, + "loss": 8.7839, + "grad_norm": 16.564252853393555, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 2.993102848, + "gpu_mem": 1.0687232, + "loss": 8.6827, + "grad_norm": 18.38329315185547, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 2.993692672, + "gpu_mem": 1.0687232, + "loss": 8.6549, + "grad_norm": 19.324323654174805, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 2.994282496, + "gpu_mem": 1.068658688, + "loss": 8.416, + "grad_norm": 17.658281326293945, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 2.99487232, + "gpu_mem": 1.068678656, + "loss": 8.5806, + "grad_norm": 13.763591766357422, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 2.995462144, + "gpu_mem": 1.06873088, + "loss": 8.3132, + "grad_norm": 14.808317184448242, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 2.99585536, + "gpu_mem": 1.068816896, + "loss": 7.9866, + "grad_norm": 15.866630554199219, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 2.996248576, + "gpu_mem": 1.068724736, + "loss": 7.2748, + "grad_norm": 19.040985107421875, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 2.9968384, + "gpu_mem": 1.068624896, + "loss": 7.0921, + "grad_norm": 19.471487045288086, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 2.997231616, + "gpu_mem": 1.068729344, + "loss": 6.4156, + "grad_norm": 23.543916702270508, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 2.997624832, + "gpu_mem": 1.069101056, + "loss": 5.6552, + "grad_norm": 27.700197219848633, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 2.998018048, + "gpu_mem": 1.068704768, + "loss": 4.8825, + "grad_norm": 28.357162475585938, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 2.99880448, + "gpu_mem": 1.068681728, + "loss": 3.7457, + "grad_norm": 21.508867263793945, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 2.999001088, + "gpu_mem": 1.068620288, + "loss": 2.8219, + "grad_norm": 19.41361427307129, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 2.999394304, + "gpu_mem": 1.068704768, + "loss": 2.1812, + "grad_norm": 11.234686851501465, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 2.999984128, + "gpu_mem": 1.068744704, + "loss": 1.642, + "grad_norm": 7.353567123413086, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 3.000377344, + "gpu_mem": 1.06880768, + "loss": 1.4183, + "grad_norm": 6.885821342468262, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 3.000573952, + "gpu_mem": 1.068644864, + "loss": 1.0639, + "grad_norm": 4.479239463806152, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 3.000967168, + "gpu_mem": 1.068756992, + "loss": 0.8437, + "grad_norm": 4.104933261871338, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 3.001163776, + "gpu_mem": 1.0689152, + "loss": 0.6619, + "grad_norm": 1.2240197658538818, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 3.001556992, + "gpu_mem": 1.06880768, + "loss": 0.8166, + "grad_norm": 5.675334453582764, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 3.0017536, + "gpu_mem": 1.068780032, + "loss": 0.6847, + "grad_norm": 1.5025297403335571, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 3.002146816, + "gpu_mem": 1.068836864, + "loss": 0.6687, + "grad_norm": 6.094733238220215, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 3.002540032, + "gpu_mem": 1.068621824, + "loss": 0.6656, + "grad_norm": 3.599914789199829, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 3.00273664, + "gpu_mem": 1.06867712, + "loss": 0.7156, + "grad_norm": 7.082047462463379, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 3.002933248, + "gpu_mem": 1.06896896, + "loss": 0.6233, + "grad_norm": 1.1837284564971924, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 3.003129856, + "gpu_mem": 1.068647936, + "loss": 0.7715, + "grad_norm": 18.536123275756836, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 3.003326464, + "gpu_mem": 1.068712448, + "loss": 0.6415, + "grad_norm": 3.613028049468994, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 3.003523072, + "gpu_mem": 1.068790784, + "loss": 0.6806, + "grad_norm": 4.323382377624512, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 3.00371968, + "gpu_mem": 1.068594176, + "loss": 0.6568, + "grad_norm": 6.955411434173584, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 3.004112896, + "gpu_mem": 1.06870784, + "loss": 0.7211, + "grad_norm": 6.009191513061523, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 3.004309504, + "gpu_mem": 1.06894592, + "loss": 0.6923, + "grad_norm": 5.080873489379883, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 3.004506112, + "gpu_mem": 1.068647936, + "loss": 0.6451, + "grad_norm": 5.852160930633545, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 3.004899328, + "gpu_mem": 1.068858368, + "loss": 0.6976, + "grad_norm": 2.980858087539673, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 3.004899328, + "gpu_mem": 1.068809216, + "loss": 0.6791, + "grad_norm": 4.070059776306152, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 3.005243392, + "gpu_mem": 1.068620288, + "loss": 0.6479, + "grad_norm": 5.109630107879639, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 3.0052352, + "gpu_mem": 1.068867584, + "loss": 0.7543, + "grad_norm": 6.712920188903809, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 3.007533056, + "gpu_mem": 1.069246976, + "loss": 0.7244, + "grad_norm": 4.7745585441589355, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 3.007926272, + "gpu_mem": 1.068816896, + "loss": 0.6404, + "grad_norm": 1.4595054388046265, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 3.00812288, + "gpu_mem": 1.069044224, + "loss": 0.7158, + "grad_norm": 5.147329807281494, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 3.012251648, + "gpu_mem": 1.068941312, + "loss": 0.7589, + "grad_norm": 7.117756366729736, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 3.012448256, + "gpu_mem": 1.068763136, + "loss": 0.6885, + "grad_norm": 6.944693565368652, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 3.012448256, + "gpu_mem": 1.068905984, + "loss": 0.6364, + "grad_norm": 0.8794609904289246, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 3.014610944, + "gpu_mem": 1.068686336, + "loss": 1.0324, + "grad_norm": 10.703725814819336, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 3.014807552, + "gpu_mem": 1.068929024, + "loss": 0.9029, + "grad_norm": 8.505386352539062, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 3.01500416, + "gpu_mem": 1.068652544, + "loss": 0.8251, + "grad_norm": 7.021992206573486, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 3.0159872, + "gpu_mem": 1.068729344, + "loss": 0.6274, + "grad_norm": 2.8866336345672607, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 3.016183808, + "gpu_mem": 1.06874624, + "loss": 0.7173, + "grad_norm": 3.7519149780273438, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 3.018477568, + "gpu_mem": 1.0686848, + "loss": 0.7664, + "grad_norm": 7.1080780029296875, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 3.020443648, + "gpu_mem": 1.068689408, + "loss": 1.0344, + "grad_norm": 11.303244590759277, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 3.020836864, + "gpu_mem": 1.06876928, + "loss": 0.9492, + "grad_norm": 8.597086906433105, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 3.021033472, + "gpu_mem": 1.06879232, + "loss": 0.993, + "grad_norm": 9.987013816833496, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 3.021033472, + "gpu_mem": 1.068720128, + "loss": 0.7134, + "grad_norm": 2.8843307495117188, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 3.021426688, + "gpu_mem": 1.068990464, + "loss": 0.6039, + "grad_norm": 1.265394926071167, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 3.021426688, + "gpu_mem": 1.06877696, + "loss": 0.8019, + "grad_norm": 5.300872802734375, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 3.021623296, + "gpu_mem": 1.068770816, + "loss": 0.7357, + "grad_norm": 4.298585414886475, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 3.021623296, + "gpu_mem": 1.068666368, + "loss": 0.7165, + "grad_norm": 4.128717422485352, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 3.022016512, + "gpu_mem": 1.068683264, + "loss": 0.8328, + "grad_norm": 4.5696845054626465, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 3.02221312, + "gpu_mem": 1.06877696, + "loss": 0.5698, + "grad_norm": 1.1020276546478271, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 3.02221312, + "gpu_mem": 1.068787712, + "loss": 0.6117, + "grad_norm": 1.2840603590011597, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 3.022409728, + "gpu_mem": 1.068775424, + "loss": 0.6904, + "grad_norm": 0.703222930431366, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 3.022409728, + "gpu_mem": 1.068767744, + "loss": 0.6291, + "grad_norm": 3.823390007019043, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 3.022606336, + "gpu_mem": 1.068697088, + "loss": 0.6666, + "grad_norm": 2.750748634338379, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 3.022606336, + "gpu_mem": 1.068741632, + "loss": 0.6596, + "grad_norm": 1.6596299409866333, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 3.022606336, + "gpu_mem": 1.068935168, + "loss": 0.5707, + "grad_norm": 1.104280948638916, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 3.022802944, + "gpu_mem": 1.068644864, + "loss": 0.6097, + "grad_norm": 1.4518613815307617, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 3.022802944, + "gpu_mem": 1.068612608, + "loss": 0.7811, + "grad_norm": 3.472898006439209, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 3.022999552, + "gpu_mem": 1.068678656, + "loss": 0.6198, + "grad_norm": 1.8571032285690308, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 3.022999552, + "gpu_mem": 1.068672512, + "loss": 0.6059, + "grad_norm": 0.7543649077415466, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 3.022999552, + "gpu_mem": 1.068901376, + "loss": 0.6369, + "grad_norm": 3.2864503860473633, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 3.022999552, + "gpu_mem": 1.068893696, + "loss": 0.6114, + "grad_norm": 3.0940074920654297, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 3.022999552, + "gpu_mem": 1.068859904, + "loss": 0.6278, + "grad_norm": 0.7879101037979126, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 3.02319616, + "gpu_mem": 1.068720128, + "loss": 0.5549, + "grad_norm": 1.6631962060928345, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 3.02319616, + "gpu_mem": 1.068644864, + "loss": 0.55, + "grad_norm": 1.4942209720611572, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 3.02319616, + "gpu_mem": 1.06858496, + "loss": 0.6338, + "grad_norm": 1.523253321647644, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 3.02319616, + "gpu_mem": 1.068658688, + "loss": 0.522, + "grad_norm": 1.1379536390304565, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 3.02319616, + "gpu_mem": 1.068710912, + "loss": 0.8138, + "grad_norm": 5.910027503967285, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 3.02319616, + "gpu_mem": 1.068843008, + "loss": 0.6741, + "grad_norm": 2.8561506271362305, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 3.023392768, + "gpu_mem": 1.068733952, + "loss": 0.6004, + "grad_norm": 1.5697033405303955, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 3.023392768, + "gpu_mem": 1.068614144, + "loss": 0.5904, + "grad_norm": 1.2251287698745728, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 3.023392768, + "gpu_mem": 1.068683264, + "loss": 0.654, + "grad_norm": 2.2357256412506104, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 3.023589376, + "gpu_mem": 1.068783104, + "loss": 0.6284, + "grad_norm": 1.343625783920288, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 3.023785984, + "gpu_mem": 1.06874624, + "loss": 0.649, + "grad_norm": 2.501361846923828, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 3.023785984, + "gpu_mem": 1.068778496, + "loss": 0.5897, + "grad_norm": 1.2885847091674805, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 3.023785984, + "gpu_mem": 1.068729344, + "loss": 0.616, + "grad_norm": 1.5744162797927856, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 3.023785984, + "gpu_mem": 1.068737024, + "loss": 0.5909, + "grad_norm": 2.462799549102783, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 3.023785984, + "gpu_mem": 1.068881408, + "loss": 0.5409, + "grad_norm": 1.7684823274612427, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 3.023785984, + "gpu_mem": 1.068663296, + "loss": 0.6646, + "grad_norm": 2.56778621673584, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 3.023785984, + "gpu_mem": 1.068717056, + "loss": 0.5649, + "grad_norm": 1.0568256378173828, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 3.023785984, + "gpu_mem": 1.0686848, + "loss": 0.5404, + "grad_norm": 1.7081698179244995, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 3.023785984, + "gpu_mem": 1.068766208, + "loss": 0.6255, + "grad_norm": 0.9770658016204834, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 3.023982592, + "gpu_mem": 1.0685696, + "loss": 0.7038, + "grad_norm": 1.3799071311950684, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068683264, + "loss": 0.7005, + "grad_norm": 3.4808766841888428, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068703232, + "loss": 0.6204, + "grad_norm": 1.9717398881912231, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068741632, + "loss": 0.5145, + "grad_norm": 1.615808367729187, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068726272, + "loss": 0.6303, + "grad_norm": 1.0164070129394531, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 3.023982592, + "gpu_mem": 1.06863872, + "loss": 0.5928, + "grad_norm": 1.5801987648010254, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068588032, + "loss": 0.5803, + "grad_norm": 2.0679094791412354, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068704768, + "loss": 0.5756, + "grad_norm": 1.0271257162094116, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068737024, + "loss": 0.5664, + "grad_norm": 1.1557652950286865, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068770816, + "loss": 0.5937, + "grad_norm": 1.039198398590088, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068821504, + "loss": 0.5946, + "grad_norm": 1.4562244415283203, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068726272, + "loss": 0.5638, + "grad_norm": 1.289884090423584, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068827648, + "loss": 0.6115, + "grad_norm": 1.2751543521881104, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068778496, + "loss": 0.6222, + "grad_norm": 1.9205100536346436, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068666368, + "loss": 0.4969, + "grad_norm": 1.0122655630111694, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068850688, + "loss": 0.5754, + "grad_norm": 1.3669155836105347, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 3.023982592, + "gpu_mem": 1.068704768, + "loss": 0.6756, + "grad_norm": 2.3788578510284424, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 3.0241792, + "gpu_mem": 1.06870784, + "loss": 0.6394, + "grad_norm": 2.7610864639282227, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 3.0241792, + "gpu_mem": 1.06867712, + "loss": 0.5875, + "grad_norm": 2.554514169692993, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 3.0241792, + "gpu_mem": 1.0687232, + "loss": 0.5541, + "grad_norm": 1.0130515098571777, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 3.0241792, + "gpu_mem": 1.068713984, + "loss": 0.5699, + "grad_norm": 1.1030726432800293, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 3.0241792, + "gpu_mem": 1.068695552, + "loss": 0.5668, + "grad_norm": 1.7635741233825684, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 3.0241792, + "gpu_mem": 1.068770816, + "loss": 0.5809, + "grad_norm": 2.15474796295166, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 3.0241792, + "gpu_mem": 1.068690944, + "loss": 0.5857, + "grad_norm": 2.1928961277008057, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 3.0241792, + "gpu_mem": 1.068581888, + "loss": 0.5269, + "grad_norm": 1.2244411706924438, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068819968, + "loss": 0.5719, + "grad_norm": 1.8091001510620117, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068990464, + "loss": 0.5121, + "grad_norm": 1.253862977027893, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 3.024375808, + "gpu_mem": 1.0687232, + "loss": 0.5483, + "grad_norm": 1.3096308708190918, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068750848, + "loss": 0.5007, + "grad_norm": 1.3187991380691528, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068801536, + "loss": 0.4983, + "grad_norm": 1.3363879919052124, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068611072, + "loss": 0.54, + "grad_norm": 2.4911434650421143, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 3.024375808, + "gpu_mem": 1.06905344, + "loss": 0.4846, + "grad_norm": 1.453972578048706, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068780032, + "loss": 0.5229, + "grad_norm": 1.3837724924087524, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068663296, + "loss": 0.5306, + "grad_norm": 1.7945332527160645, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 3.024375808, + "gpu_mem": 1.069102592, + "loss": 0.5396, + "grad_norm": 1.5270707607269287, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068878336, + "loss": 0.4676, + "grad_norm": 1.7519081830978394, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068918272, + "loss": 0.5526, + "grad_norm": 1.4487687349319458, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 3.024375808, + "gpu_mem": 1.06870016, + "loss": 0.6355, + "grad_norm": 1.3922486305236816, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068829184, + "loss": 0.5467, + "grad_norm": 1.3203169107437134, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068910592, + "loss": 0.4764, + "grad_norm": 1.8880524635314941, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068694016, + "loss": 0.5068, + "grad_norm": 1.802552342414856, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068827648, + "loss": 0.4676, + "grad_norm": 2.9769034385681152, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068850688, + "loss": 0.5116, + "grad_norm": 1.4150545597076416, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068687872, + "loss": 0.4517, + "grad_norm": 1.7311124801635742, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068568064, + "loss": 0.5902, + "grad_norm": 2.247164726257324, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068749312, + "loss": 0.4995, + "grad_norm": 2.6833088397979736, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068647936, + "loss": 0.5016, + "grad_norm": 2.480593204498291, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 3.024375808, + "gpu_mem": 1.06870016, + "loss": 0.4778, + "grad_norm": 1.5207394361495972, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068732416, + "loss": 0.5666, + "grad_norm": 1.44263756275177, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068872192, + "loss": 0.4993, + "grad_norm": 2.264585256576538, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068855296, + "loss": 0.618, + "grad_norm": 2.1623430252075195, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 3.024375808, + "gpu_mem": 1.069047296, + "loss": 0.5553, + "grad_norm": 1.8484605550765991, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068758528, + "loss": 0.5344, + "grad_norm": 3.229119300842285, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 3.024375808, + "gpu_mem": 1.068793856, + "loss": 0.4841, + "grad_norm": 1.4119296073913574, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 3.024375808, + "gpu_mem": 1.06869248, + "loss": 0.519, + "grad_norm": 1.645526647567749, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 3.024375808, + "gpu_mem": 1.075164672, + "loss": 0.7668, + "grad_norm": 3.4961225986480713, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 3.024572416, + "gpu_mem": 1.07510016, + "loss": 0.4852, + "grad_norm": 2.7741973400115967, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 3.024572416, + "gpu_mem": 1.074937344, + "loss": 0.4893, + "grad_norm": 2.4123740196228027, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 3.024330752, + "gpu_mem": 1.075009536, + "loss": 0.5661, + "grad_norm": 2.72945237159729, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075044864, + "loss": 0.4701, + "grad_norm": 2.0736031532287598, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 3.02452736, + "gpu_mem": 1.07506944, + "loss": 0.6137, + "grad_norm": 1.7767512798309326, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 3.02452736, + "gpu_mem": 1.07503104, + "loss": 0.4822, + "grad_norm": 1.5099303722381592, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075252224, + "loss": 0.487, + "grad_norm": 1.8491995334625244, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075160064, + "loss": 0.4668, + "grad_norm": 3.0728366374969482, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075066368, + "loss": 0.4703, + "grad_norm": 1.3022750616073608, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 3.02452736, + "gpu_mem": 1.07499264, + "loss": 0.4533, + "grad_norm": 1.5942481756210327, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075341312, + "loss": 0.4258, + "grad_norm": 1.7073136568069458, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 3.02452736, + "gpu_mem": 1.074935808, + "loss": 0.4883, + "grad_norm": 2.8987491130828857, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 3.02452736, + "gpu_mem": 1.074882048, + "loss": 0.3961, + "grad_norm": 1.3458895683288574, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075657728, + "loss": 0.3968, + "grad_norm": 1.7554863691329956, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075133952, + "loss": 0.4727, + "grad_norm": 2.3294637203216553, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 3.02452736, + "gpu_mem": 1.0750464, + "loss": 0.559, + "grad_norm": 2.5595593452453613, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 3.02452736, + "gpu_mem": 1.074995712, + "loss": 0.4919, + "grad_norm": 2.569507122039795, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075090944, + "loss": 0.4528, + "grad_norm": 2.5460243225097656, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075008, + "loss": 0.5163, + "grad_norm": 1.9810658693313599, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075026432, + "loss": 0.5094, + "grad_norm": 2.57608699798584, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 3.02452736, + "gpu_mem": 1.075113984, + "loss": 0.5151, + "grad_norm": 4.170752048492432, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 3.02452736, + "gpu_mem": 1.074998784, + "loss": 0.6024, + "grad_norm": 2.973747491836548, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 3.02452736, + "gpu_mem": 1.07506176, + "loss": 0.4935, + "grad_norm": 1.5709177255630493, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 3.02452736, + "gpu_mem": 1.0749696, + "loss": 0.3875, + "grad_norm": 1.98631751537323, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 3.024723968, + "gpu_mem": 1.07526912, + "loss": 0.509, + "grad_norm": 1.7751320600509644, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 3.024723968, + "gpu_mem": 1.07499264, + "loss": 0.5287, + "grad_norm": 2.255643606185913, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 3.024723968, + "gpu_mem": 1.074958848, + "loss": 0.5266, + "grad_norm": 1.9241957664489746, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 3.024723968, + "gpu_mem": 1.075097088, + "loss": 0.4003, + "grad_norm": 2.2404611110687256, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 3.024723968, + "gpu_mem": 1.075195392, + "loss": 0.4808, + "grad_norm": 1.8443471193313599, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 3.024723968, + "gpu_mem": 1.074941952, + "loss": 0.4479, + "grad_norm": 1.9600310325622559, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 3.024723968, + "gpu_mem": 1.075041792, + "loss": 0.5299, + "grad_norm": 2.4954330921173096, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 3.024723968, + "gpu_mem": 1.075014144, + "loss": 0.4952, + "grad_norm": 2.905496597290039, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 3.024723968, + "gpu_mem": 1.074951168, + "loss": 0.4377, + "grad_norm": 2.2784111499786377, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 3.024723968, + "gpu_mem": 1.07516928, + "loss": 0.5208, + "grad_norm": 3.6050169467926025, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 3.024723968, + "gpu_mem": 1.075066368, + "loss": 0.4716, + "grad_norm": 1.895915150642395, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 3.024723968, + "gpu_mem": 1.075014144, + "loss": 0.4344, + "grad_norm": 1.6646631956100464, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 3.024723968, + "gpu_mem": 1.07499264, + "loss": 0.4424, + "grad_norm": 1.8181227445602417, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 3.024723968, + "gpu_mem": 1.075001856, + "loss": 0.3862, + "grad_norm": 1.5334510803222656, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 3.024420864, + "gpu_mem": 1.074934272, + "loss": 0.6011, + "grad_norm": 3.338742971420288, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 3.024617472, + "gpu_mem": 1.075097088, + "loss": 0.4274, + "grad_norm": 2.4851043224334717, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 3.0245888, + "gpu_mem": 1.074966528, + "loss": 0.5496, + "grad_norm": 2.2646358013153076, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 3.0245888, + "gpu_mem": 1.075086336, + "loss": 0.4594, + "grad_norm": 2.1592464447021484, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 3.0245888, + "gpu_mem": 1.074905088, + "loss": 0.4049, + "grad_norm": 1.844449758529663, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 3.0245888, + "gpu_mem": 1.075037184, + "loss": 0.382, + "grad_norm": 2.8812406063079834, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 3.0245888, + "gpu_mem": 1.075011072, + "loss": 0.4015, + "grad_norm": 1.388166904449463, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 3.0245888, + "gpu_mem": 1.07497728, + "loss": 0.4162, + "grad_norm": 1.6763309240341187, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 3.0245888, + "gpu_mem": 1.075081728, + "loss": 0.3628, + "grad_norm": 1.3622221946716309, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 3.0245888, + "gpu_mem": 1.07507712, + "loss": 0.4919, + "grad_norm": 2.0335755348205566, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 3.0245888, + "gpu_mem": 1.074935808, + "loss": 0.4112, + "grad_norm": 2.5751991271972656, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 3.0245888, + "gpu_mem": 1.075127808, + "loss": 0.4315, + "grad_norm": 1.641266942024231, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 3.0245888, + "gpu_mem": 1.074978816, + "loss": 0.4738, + "grad_norm": 2.687556505203247, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 3.0245888, + "gpu_mem": 1.075081728, + "loss": 0.4541, + "grad_norm": 2.1629574298858643, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 3.0245888, + "gpu_mem": 1.07528448, + "loss": 0.3815, + "grad_norm": 2.7345011234283447, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 3.0245888, + "gpu_mem": 1.075086336, + "loss": 0.4441, + "grad_norm": 2.783310651779175, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 3.0245888, + "gpu_mem": 1.074972672, + "loss": 0.3808, + "grad_norm": 2.3834574222564697, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 3.0245888, + "gpu_mem": 1.074986496, + "loss": 0.449, + "grad_norm": 3.366748571395874, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 3.024785408, + "gpu_mem": 1.07503104, + "loss": 0.4842, + "grad_norm": 3.7573835849761963, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 3.024785408, + "gpu_mem": 1.074972672, + "loss": 0.3926, + "grad_norm": 2.3505759239196777, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 3.024785408, + "gpu_mem": 1.075206144, + "loss": 0.4886, + "grad_norm": 2.816462755203247, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 3.024785408, + "gpu_mem": 1.075236864, + "loss": 0.4668, + "grad_norm": 4.412685394287109, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 3.024687104, + "gpu_mem": 1.075163136, + "loss": 0.5436, + "grad_norm": 2.401031255722046, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 3.024678912, + "gpu_mem": 1.075051008, + "loss": 0.5138, + "grad_norm": 3.1173043251037598, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 3.024871424, + "gpu_mem": 1.07500032, + "loss": 0.3884, + "grad_norm": 3.181987762451172, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 3.024871424, + "gpu_mem": 1.0749696, + "loss": 0.4573, + "grad_norm": 2.1436924934387207, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 3.024871424, + "gpu_mem": 1.07499264, + "loss": 0.5766, + "grad_norm": 3.5437324047088623, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 3.024871424, + "gpu_mem": 1.075075584, + "loss": 0.4327, + "grad_norm": 2.2185556888580322, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 3.024871424, + "gpu_mem": 1.075003392, + "loss": 0.5787, + "grad_norm": 4.941860675811768, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 3.024666624, + "gpu_mem": 1.07516928, + "loss": 0.4142, + "grad_norm": 1.9517929553985596, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075011072, + "loss": 0.4439, + "grad_norm": 3.090721368789673, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074988032, + "loss": 0.3915, + "grad_norm": 2.453636407852173, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 3.024666624, + "gpu_mem": 1.075113984, + "loss": 0.4858, + "grad_norm": 2.096686601638794, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075147776, + "loss": 0.4988, + "grad_norm": 2.437363862991333, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07501568, + "loss": 0.4518, + "grad_norm": 2.46529221534729, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075152384, + "loss": 0.4895, + "grad_norm": 2.251645088195801, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075066368, + "loss": 0.3613, + "grad_norm": 2.8285458087921143, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07503104, + "loss": 0.3692, + "grad_norm": 2.062828779220581, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074995712, + "loss": 0.3709, + "grad_norm": 1.7823961973190308, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075144704, + "loss": 0.4057, + "grad_norm": 2.202742338180542, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075034112, + "loss": 0.4497, + "grad_norm": 2.1048293113708496, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074978816, + "loss": 0.4873, + "grad_norm": 1.9505807161331177, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074920448, + "loss": 0.4034, + "grad_norm": 2.175112724304199, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07497728, + "loss": 0.3997, + "grad_norm": 1.9738308191299438, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075255296, + "loss": 0.4895, + "grad_norm": 2.2598307132720947, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074978816, + "loss": 0.4872, + "grad_norm": 2.378850221633911, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07529216, + "loss": 0.4147, + "grad_norm": 2.264596939086914, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075167744, + "loss": 0.5108, + "grad_norm": 2.694446563720703, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07492352, + "loss": 0.3608, + "grad_norm": 1.9014707803726196, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074983424, + "loss": 0.4918, + "grad_norm": 3.32745099067688, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075044864, + "loss": 0.4804, + "grad_norm": 2.260383129119873, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 3.024863232, + "gpu_mem": 1.0750464, + "loss": 0.4134, + "grad_norm": 2.0600831508636475, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07529984, + "loss": 0.4668, + "grad_norm": 2.459625005722046, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074949632, + "loss": 0.6129, + "grad_norm": 2.439835786819458, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07524608, + "loss": 0.4002, + "grad_norm": 1.9965859651565552, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07510784, + "loss": 0.4994, + "grad_norm": 2.1573808193206787, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074960384, + "loss": 0.6049, + "grad_norm": 3.2378268241882324, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 3.024863232, + "gpu_mem": 1.07510016, + "loss": 0.3684, + "grad_norm": 2.0926833152770996, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 3.024863232, + "gpu_mem": 1.074978816, + "loss": 0.4349, + "grad_norm": 1.8976649045944214, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075072512, + "loss": 0.4824, + "grad_norm": 2.24778413772583, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 3.024863232, + "gpu_mem": 1.075090944, + "loss": 0.3411, + "grad_norm": 1.7593706846237183, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075043328, + "loss": 0.3704, + "grad_norm": 1.9715851545333862, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 3.02505984, + "gpu_mem": 1.074949632, + "loss": 0.3807, + "grad_norm": 2.5225422382354736, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075041792, + "loss": 0.4734, + "grad_norm": 2.3031728267669678, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 3.02505984, + "gpu_mem": 1.07495424, + "loss": 0.33, + "grad_norm": 2.017667770385742, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 3.02505984, + "gpu_mem": 1.074997248, + "loss": 0.3932, + "grad_norm": 2.3432397842407227, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075190784, + "loss": 0.4538, + "grad_norm": 2.0075507164001465, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 3.02505984, + "gpu_mem": 1.074986496, + "loss": 0.4852, + "grad_norm": 2.306149959564209, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075143168, + "loss": 0.5241, + "grad_norm": 2.369124174118042, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 3.02505984, + "gpu_mem": 1.0749696, + "loss": 0.4323, + "grad_norm": 2.174739360809326, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075382784, + "loss": 0.4591, + "grad_norm": 1.7277308702468872, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075041792, + "loss": 0.469, + "grad_norm": 3.157832384109497, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 3.02505984, + "gpu_mem": 1.074958848, + "loss": 0.4108, + "grad_norm": 2.3661012649536133, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075075584, + "loss": 0.3712, + "grad_norm": 1.8118715286254883, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 3.02505984, + "gpu_mem": 1.07503104, + "loss": 0.3222, + "grad_norm": 1.7548097372055054, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 3.02505984, + "gpu_mem": 1.074988032, + "loss": 0.4032, + "grad_norm": 1.77567720413208, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075024896, + "loss": 0.3679, + "grad_norm": 2.026758909225464, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075112448, + "loss": 0.3857, + "grad_norm": 1.9547120332717896, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 3.02505984, + "gpu_mem": 1.07503104, + "loss": 0.4915, + "grad_norm": 2.225541830062866, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 3.02505984, + "gpu_mem": 1.07524608, + "loss": 0.4647, + "grad_norm": 2.119649648666382, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 3.02505984, + "gpu_mem": 1.07503872, + "loss": 0.4975, + "grad_norm": 3.1394009590148926, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 3.02505984, + "gpu_mem": 1.075043328, + "loss": 0.4351, + "grad_norm": 2.0435707569122314, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 3.02505984, + "gpu_mem": 1.07505408, + "loss": 0.5114, + "grad_norm": 3.1682026386260986, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 2.932064256, + "gpu_mem": 1.07509248, + "loss": 0.4417, + "grad_norm": 2.131368637084961, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 2.929115136, + "gpu_mem": 1.075144704, + "loss": 0.453, + "grad_norm": 2.360452175140381, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 2.929508352, + "gpu_mem": 1.075003392, + "loss": 0.4413, + "grad_norm": 1.8082300424575806, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 2.929901568, + "gpu_mem": 1.074883584, + "loss": 0.4387, + "grad_norm": 2.025212287902832, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 2.930688, + "gpu_mem": 1.075110912, + "loss": 0.4838, + "grad_norm": 2.322986364364624, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 2.930884608, + "gpu_mem": 1.075355136, + "loss": 0.4447, + "grad_norm": 2.5802230834960938, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 2.931277824, + "gpu_mem": 1.07501568, + "loss": 0.4898, + "grad_norm": 2.2804408073425293, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 2.93167104, + "gpu_mem": 1.07496192, + "loss": 0.4575, + "grad_norm": 2.110462188720703, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 2.932064256, + "gpu_mem": 1.075124736, + "loss": 0.5217, + "grad_norm": 2.344633102416992, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 2.932260864, + "gpu_mem": 1.075064832, + "loss": 0.4805, + "grad_norm": 2.08709979057312, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 2.93265408, + "gpu_mem": 1.075044864, + "loss": 0.3981, + "grad_norm": 1.8200031518936157, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 2.933047296, + "gpu_mem": 1.074980352, + "loss": 0.4251, + "grad_norm": 2.5643293857574463, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 2.933243904, + "gpu_mem": 1.075408896, + "loss": 0.3721, + "grad_norm": 2.6374361515045166, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 2.93363712, + "gpu_mem": 1.07511552, + "loss": 0.3651, + "grad_norm": 1.9787962436676025, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 2.933833728, + "gpu_mem": 1.074971136, + "loss": 0.4578, + "grad_norm": 1.8029755353927612, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 2.934226944, + "gpu_mem": 1.075024896, + "loss": 0.3955, + "grad_norm": 2.1217472553253174, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 2.934423552, + "gpu_mem": 1.075442688, + "loss": 0.4484, + "grad_norm": 2.086652994155884, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 2.93462016, + "gpu_mem": 1.075212288, + "loss": 0.3601, + "grad_norm": 1.9902241230010986, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 2.934816768, + "gpu_mem": 1.074997248, + "loss": 0.5829, + "grad_norm": 3.221863031387329, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 2.935013376, + "gpu_mem": 1.075081728, + "loss": 0.4291, + "grad_norm": 1.8546580076217651, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 2.935209984, + "gpu_mem": 1.075006464, + "loss": 0.5726, + "grad_norm": 3.5396029949188232, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 2.935013376, + "gpu_mem": 1.075041792, + "loss": 0.4676, + "grad_norm": 3.036689281463623, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 2.9356032, + "gpu_mem": 1.075124736, + "loss": 0.3829, + "grad_norm": 1.836418867111206, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 2.935799808, + "gpu_mem": 1.075041792, + "loss": 0.5037, + "grad_norm": 3.046307325363159, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 2.935996416, + "gpu_mem": 1.075067904, + "loss": 0.4862, + "grad_norm": 2.6024115085601807, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 2.935996416, + "gpu_mem": 1.075067904, + "train_runtime": 4529.9321, + "train_samples_per_second": 4.162, + "train_steps_per_second": 0.065, + "total_flos": 4.702018789028659e+16, + "train_loss": 0.8857108023093672 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..97cff55d3f03a364161498b7b6299c246238daf5 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5031f46ee95abd5f4e41e88565f3836fe0963e69 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.8159021406727829 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..22e50e8c09512d3d236046d9e3cd89b41d639cd9 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-boolq-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-boolq-r32-a2", + "seed": 42, + "timestamp": "2025-09-01T09:04:05.338079" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..7aa1065cae725ce76761ceab636650b75979e0c1 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r32-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 3.273248768, + "gpu_mem": 1.150930944, + "loss": 8.7378, + "grad_norm": 52.36585998535156, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 3.2740352, + "gpu_mem": 1.352919552, + "loss": 8.7839, + "grad_norm": 57.147090911865234, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 3.274821632, + "gpu_mem": 1.352838144, + "loss": 8.1728, + "grad_norm": 54.23906326293945, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 3.275411456, + "gpu_mem": 1.352838144, + "loss": 7.0753, + "grad_norm": 63.66853713989258, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 3.27600128, + "gpu_mem": 1.352773632, + "loss": 5.411, + "grad_norm": 63.42523193359375, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 3.276591104, + "gpu_mem": 1.3527936, + "loss": 3.8395, + "grad_norm": 52.70524215698242, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 3.277180928, + "gpu_mem": 1.352845824, + "loss": 2.0669, + "grad_norm": 18.087276458740234, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 3.277574144, + "gpu_mem": 1.35293184, + "loss": 1.4394, + "grad_norm": 13.501249313354492, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 3.27796736, + "gpu_mem": 1.35283968, + "loss": 0.9751, + "grad_norm": 7.490012168884277, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 3.278557184, + "gpu_mem": 1.35273984, + "loss": 0.7542, + "grad_norm": 3.78539776802063, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 3.2789504, + "gpu_mem": 1.352844288, + "loss": 0.6885, + "grad_norm": 5.337689399719238, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 3.279343616, + "gpu_mem": 1.353216, + "loss": 0.8726, + "grad_norm": 18.26336097717285, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 3.279736832, + "gpu_mem": 1.352819712, + "loss": 0.7022, + "grad_norm": 9.844132423400879, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 3.280130048, + "gpu_mem": 1.352796672, + "loss": 1.0608, + "grad_norm": 37.057899475097656, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 3.280523264, + "gpu_mem": 1.352735232, + "loss": 1.1215, + "grad_norm": 22.301456451416016, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 3.28091648, + "gpu_mem": 1.352819712, + "loss": 0.7225, + "grad_norm": 8.182317733764648, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 3.281113088, + "gpu_mem": 1.352859648, + "loss": 1.2458, + "grad_norm": 19.587060928344727, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 3.281506304, + "gpu_mem": 1.352922624, + "loss": 1.1259, + "grad_norm": 14.39857292175293, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 3.281702912, + "gpu_mem": 1.352759808, + "loss": 0.8808, + "grad_norm": 10.709318161010742, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 3.282096128, + "gpu_mem": 1.352871936, + "loss": 0.8818, + "grad_norm": 11.054442405700684, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 3.282292736, + "gpu_mem": 1.353030144, + "loss": 0.9173, + "grad_norm": 9.976797103881836, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 3.282685952, + "gpu_mem": 1.352922624, + "loss": 0.6792, + "grad_norm": 1.19441819190979, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 3.28288256, + "gpu_mem": 1.352894976, + "loss": 0.6519, + "grad_norm": 0.9090519547462463, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 3.283275776, + "gpu_mem": 1.352951808, + "loss": 0.5632, + "grad_norm": 1.4651094675064087, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 3.283668992, + "gpu_mem": 1.352736768, + "loss": 0.6136, + "grad_norm": 1.4357552528381348, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 3.2838656, + "gpu_mem": 1.352792064, + "loss": 0.7423, + "grad_norm": 6.041957855224609, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 3.284062208, + "gpu_mem": 1.353083904, + "loss": 0.8464, + "grad_norm": 7.431577682495117, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 3.284258816, + "gpu_mem": 1.35276288, + "loss": 0.7603, + "grad_norm": 5.0595526695251465, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 3.284652032, + "gpu_mem": 1.352827392, + "loss": 0.6082, + "grad_norm": 2.658418893814087, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 3.28484864, + "gpu_mem": 1.352905728, + "loss": 0.6273, + "grad_norm": 2.112206220626831, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 3.285045248, + "gpu_mem": 1.35270912, + "loss": 0.6012, + "grad_norm": 3.911397933959961, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 3.285241856, + "gpu_mem": 1.352822784, + "loss": 0.6792, + "grad_norm": 2.459207057952881, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 3.285438464, + "gpu_mem": 1.353060864, + "loss": 0.7658, + "grad_norm": 2.3136324882507324, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 3.28583168, + "gpu_mem": 1.35276288, + "loss": 0.5876, + "grad_norm": 2.4393556118011475, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 3.286028288, + "gpu_mem": 1.352973312, + "loss": 0.685, + "grad_norm": 2.3712430000305176, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 3.286224896, + "gpu_mem": 1.35292416, + "loss": 0.6609, + "grad_norm": 2.2131965160369873, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 3.286421504, + "gpu_mem": 1.352735232, + "loss": 0.6173, + "grad_norm": 2.2226457595825195, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 3.286618112, + "gpu_mem": 1.352982528, + "loss": 0.7774, + "grad_norm": 3.8071861267089844, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 3.28681472, + "gpu_mem": 1.35336192, + "loss": 0.675, + "grad_norm": 2.2399120330810547, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 3.287011328, + "gpu_mem": 1.35293184, + "loss": 0.6265, + "grad_norm": 2.643010139465332, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 3.287207936, + "gpu_mem": 1.353159168, + "loss": 0.6807, + "grad_norm": 2.315294027328491, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 3.287404544, + "gpu_mem": 1.353056256, + "loss": 0.5752, + "grad_norm": 1.0011048316955566, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 3.287404544, + "gpu_mem": 1.35287808, + "loss": 0.5598, + "grad_norm": 1.045304536819458, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 3.287601152, + "gpu_mem": 1.353020928, + "loss": 0.5222, + "grad_norm": 1.3168087005615234, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 3.287601152, + "gpu_mem": 1.35280128, + "loss": 0.7656, + "grad_norm": 3.0438952445983887, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 3.28779776, + "gpu_mem": 1.353043968, + "loss": 0.7624, + "grad_norm": 5.301084041595459, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 3.287994368, + "gpu_mem": 1.352767488, + "loss": 0.7125, + "grad_norm": 4.372990608215332, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 3.288190976, + "gpu_mem": 1.352844288, + "loss": 0.5278, + "grad_norm": 1.9276179075241089, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 3.288190976, + "gpu_mem": 1.352861184, + "loss": 0.9279, + "grad_norm": 5.972068786621094, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 3.288387584, + "gpu_mem": 1.352799744, + "loss": 0.7403, + "grad_norm": 4.211651802062988, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 3.288387584, + "gpu_mem": 1.352804352, + "loss": 0.5065, + "grad_norm": 1.4248895645141602, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 3.288584192, + "gpu_mem": 1.352884224, + "loss": 0.6393, + "grad_norm": 1.3210108280181885, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 3.288584192, + "gpu_mem": 1.352907264, + "loss": 0.8341, + "grad_norm": 5.0811052322387695, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 3.2887808, + "gpu_mem": 1.352835072, + "loss": 0.5891, + "grad_norm": 2.0136959552764893, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 3.2887808, + "gpu_mem": 1.353105408, + "loss": 0.5486, + "grad_norm": 1.2197656631469727, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 3.288977408, + "gpu_mem": 1.352891904, + "loss": 0.8744, + "grad_norm": 5.156645774841309, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 3.288977408, + "gpu_mem": 1.35288576, + "loss": 0.6408, + "grad_norm": 3.2929751873016357, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 3.288977408, + "gpu_mem": 1.352781312, + "loss": 0.5329, + "grad_norm": 1.484366774559021, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 3.289174016, + "gpu_mem": 1.352798208, + "loss": 0.7173, + "grad_norm": 3.88208270072937, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 3.289174016, + "gpu_mem": 1.352891904, + "loss": 0.6916, + "grad_norm": 5.392721176147461, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 3.289174016, + "gpu_mem": 1.352902656, + "loss": 0.5711, + "grad_norm": 2.9586424827575684, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 3.289370624, + "gpu_mem": 1.352890368, + "loss": 0.7748, + "grad_norm": 4.267385482788086, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 3.289370624, + "gpu_mem": 1.352882688, + "loss": 0.4302, + "grad_norm": 2.3629183769226074, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 3.289567232, + "gpu_mem": 1.352812032, + "loss": 0.5648, + "grad_norm": 3.0491480827331543, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 3.28976384, + "gpu_mem": 1.352856576, + "loss": 0.5123, + "grad_norm": 1.8390069007873535, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 3.28976384, + "gpu_mem": 1.353050112, + "loss": 0.5665, + "grad_norm": 3.017221212387085, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 3.28976384, + "gpu_mem": 1.352759808, + "loss": 0.5266, + "grad_norm": 2.3310868740081787, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 3.28976384, + "gpu_mem": 1.352727552, + "loss": 0.6596, + "grad_norm": 2.4974825382232666, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 3.28976384, + "gpu_mem": 1.3527936, + "loss": 0.5559, + "grad_norm": 2.0011773109436035, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 3.28976384, + "gpu_mem": 1.352787456, + "loss": 0.5068, + "grad_norm": 1.5908581018447876, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 3.289960448, + "gpu_mem": 1.35301632, + "loss": 0.5327, + "grad_norm": 2.7618117332458496, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 3.289960448, + "gpu_mem": 1.35300864, + "loss": 0.5118, + "grad_norm": 2.769101619720459, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 3.290157056, + "gpu_mem": 1.352974848, + "loss": 0.7546, + "grad_norm": 3.347548007965088, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 3.290157056, + "gpu_mem": 1.352835072, + "loss": 0.4547, + "grad_norm": 2.192159652709961, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 3.290157056, + "gpu_mem": 1.352759808, + "loss": 0.5159, + "grad_norm": 1.8108059167861938, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 3.290157056, + "gpu_mem": 1.352699904, + "loss": 0.4985, + "grad_norm": 1.922587275505066, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 3.290157056, + "gpu_mem": 1.352773632, + "loss": 0.4498, + "grad_norm": 1.7314988374710083, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 3.290157056, + "gpu_mem": 1.352825856, + "loss": 0.6235, + "grad_norm": 4.058626174926758, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 3.290157056, + "gpu_mem": 1.352957952, + "loss": 0.5759, + "grad_norm": 2.122950792312622, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 3.290353664, + "gpu_mem": 1.352848896, + "loss": 0.515, + "grad_norm": 1.6026368141174316, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 3.290353664, + "gpu_mem": 1.352729088, + "loss": 0.5402, + "grad_norm": 1.6085011959075928, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 3.290353664, + "gpu_mem": 1.352798208, + "loss": 0.5479, + "grad_norm": 1.6739803552627563, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 3.290550272, + "gpu_mem": 1.352898048, + "loss": 0.5892, + "grad_norm": 2.3180227279663086, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 3.290550272, + "gpu_mem": 1.352861184, + "loss": 0.4401, + "grad_norm": 1.7601823806762695, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 3.290550272, + "gpu_mem": 1.35289344, + "loss": 0.5189, + "grad_norm": 1.7403130531311035, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 3.290550272, + "gpu_mem": 1.352844288, + "loss": 0.525, + "grad_norm": 1.7776806354522705, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 3.290550272, + "gpu_mem": 1.352851968, + "loss": 0.5168, + "grad_norm": 1.8197221755981445, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 3.290550272, + "gpu_mem": 1.352996352, + "loss": 0.5119, + "grad_norm": 1.962997555732727, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 3.290550272, + "gpu_mem": 1.35277824, + "loss": 0.5389, + "grad_norm": 2.2364699840545654, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 3.290550272, + "gpu_mem": 1.352832, + "loss": 0.407, + "grad_norm": 2.153712034225464, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 3.290550272, + "gpu_mem": 1.352799744, + "loss": 0.4369, + "grad_norm": 1.928503394126892, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352881152, + "loss": 0.6359, + "grad_norm": 4.846434116363525, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352684544, + "loss": 0.7033, + "grad_norm": 2.7724034786224365, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352798208, + "loss": 0.5884, + "grad_norm": 3.511723756790161, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352818176, + "loss": 0.5494, + "grad_norm": 2.426910877227783, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352856576, + "loss": 0.4237, + "grad_norm": 1.7255536317825317, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352841216, + "loss": 0.5944, + "grad_norm": 2.9217448234558105, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352753664, + "loss": 0.5232, + "grad_norm": 3.309817314147949, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352702976, + "loss": 0.574, + "grad_norm": 2.7542884349823, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352819712, + "loss": 0.56, + "grad_norm": 2.6793477535247803, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352851968, + "loss": 0.5243, + "grad_norm": 2.742920160293579, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 3.29074688, + "gpu_mem": 1.35288576, + "loss": 0.503, + "grad_norm": 1.820168375968933, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352936448, + "loss": 0.592, + "grad_norm": 2.8000380992889404, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352841216, + "loss": 0.5713, + "grad_norm": 3.2366268634796143, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352942592, + "loss": 0.5373, + "grad_norm": 2.5028128623962402, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 3.29074688, + "gpu_mem": 1.35289344, + "loss": 0.4162, + "grad_norm": 1.8481773138046265, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 3.29074688, + "gpu_mem": 1.352781312, + "loss": 0.4796, + "grad_norm": 2.4446568489074707, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352965632, + "loss": 0.5183, + "grad_norm": 2.0678813457489014, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352819712, + "loss": 0.5407, + "grad_norm": 1.789390206336975, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352822784, + "loss": 0.4802, + "grad_norm": 1.6455665826797485, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352792064, + "loss": 0.4802, + "grad_norm": 1.36224365234375, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352838144, + "loss": 0.5078, + "grad_norm": 1.9794381856918335, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352828928, + "loss": 0.4539, + "grad_norm": 1.6282660961151123, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352810496, + "loss": 0.4087, + "grad_norm": 1.4250513315200806, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 3.290943488, + "gpu_mem": 1.35288576, + "loss": 0.4804, + "grad_norm": 1.8816901445388794, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352805888, + "loss": 0.4058, + "grad_norm": 1.5584790706634521, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352696832, + "loss": 0.4073, + "grad_norm": 2.26464581489563, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352934912, + "loss": 0.5979, + "grad_norm": 3.0938050746917725, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 3.290943488, + "gpu_mem": 1.353105408, + "loss": 0.4914, + "grad_norm": 2.642796754837036, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352838144, + "loss": 0.4042, + "grad_norm": 2.714768409729004, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352865792, + "loss": 0.5693, + "grad_norm": 3.2306149005889893, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 3.290943488, + "gpu_mem": 1.35291648, + "loss": 0.4109, + "grad_norm": 2.6077404022216797, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352726016, + "loss": 0.4581, + "grad_norm": 3.151318073272705, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 3.290943488, + "gpu_mem": 1.353168384, + "loss": 0.452, + "grad_norm": 2.404751777648926, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352894976, + "loss": 0.4353, + "grad_norm": 2.141434907913208, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 3.290943488, + "gpu_mem": 1.35277824, + "loss": 0.5108, + "grad_norm": 2.8292367458343506, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 3.290943488, + "gpu_mem": 1.353217536, + "loss": 0.4715, + "grad_norm": 1.8124420642852783, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 3.290943488, + "gpu_mem": 1.35299328, + "loss": 0.4118, + "grad_norm": 2.0969150066375732, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 3.290943488, + "gpu_mem": 1.353033216, + "loss": 0.4463, + "grad_norm": 2.5721445083618164, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352815104, + "loss": 0.565, + "grad_norm": 1.945946455001831, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352944128, + "loss": 0.47, + "grad_norm": 1.6579515933990479, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 3.290943488, + "gpu_mem": 1.353025536, + "loss": 0.39, + "grad_norm": 3.0418357849121094, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 3.290943488, + "gpu_mem": 1.35280896, + "loss": 0.4399, + "grad_norm": 2.5622801780700684, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352942592, + "loss": 0.4986, + "grad_norm": 3.7753024101257324, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352965632, + "loss": 0.551, + "grad_norm": 3.1442036628723145, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352802816, + "loss": 0.4607, + "grad_norm": 2.388061046600342, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352683008, + "loss": 0.4896, + "grad_norm": 2.8195765018463135, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352864256, + "loss": 0.4414, + "grad_norm": 2.851844310760498, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 3.290943488, + "gpu_mem": 1.35276288, + "loss": 0.3744, + "grad_norm": 1.9495959281921387, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 3.290943488, + "gpu_mem": 1.352815104, + "loss": 0.5123, + "grad_norm": 2.3302037715911865, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 3.290943488, + "gpu_mem": 1.35284736, + "loss": 0.6154, + "grad_norm": 3.6952226161956787, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 3.291140096, + "gpu_mem": 1.352987136, + "loss": 0.4235, + "grad_norm": 2.773329496383667, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 3.291140096, + "gpu_mem": 1.35297024, + "loss": 0.5238, + "grad_norm": 1.8093615770339966, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 3.291140096, + "gpu_mem": 1.35316224, + "loss": 0.5201, + "grad_norm": 1.94582998752594, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 3.291140096, + "gpu_mem": 1.352873472, + "loss": 0.461, + "grad_norm": 1.404140830039978, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 3.291140096, + "gpu_mem": 1.3529088, + "loss": 0.5005, + "grad_norm": 2.0005831718444824, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 3.291140096, + "gpu_mem": 1.352807424, + "loss": 0.4766, + "grad_norm": 1.767867088317871, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453897216, + "loss": 0.5596, + "grad_norm": 2.1566927433013916, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453832704, + "loss": 0.3519, + "grad_norm": 2.0478665828704834, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453669888, + "loss": 0.3509, + "grad_norm": 2.0778300762176514, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45374208, + "loss": 0.308, + "grad_norm": 1.7119958400726318, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453777408, + "loss": 0.3198, + "grad_norm": 1.577049732208252, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453801984, + "loss": 0.4208, + "grad_norm": 2.306021213531494, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453763584, + "loss": 0.3844, + "grad_norm": 2.6056602001190186, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453984768, + "loss": 0.3883, + "grad_norm": 2.2159955501556396, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453892608, + "loss": 0.3239, + "grad_norm": 3.0970022678375244, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453798912, + "loss": 0.3305, + "grad_norm": 2.0860419273376465, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453725184, + "loss": 0.3575, + "grad_norm": 2.038331985473633, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454073856, + "loss": 0.2518, + "grad_norm": 2.3864269256591797, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453668352, + "loss": 0.244, + "grad_norm": 1.6638277769088745, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453614592, + "loss": 0.3273, + "grad_norm": 1.8513169288635254, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454390272, + "loss": 0.2489, + "grad_norm": 1.2091186046600342, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453866496, + "loss": 0.3402, + "grad_norm": 2.0717952251434326, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453778944, + "loss": 0.3422, + "grad_norm": 2.076378583908081, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453728256, + "loss": 0.2951, + "grad_norm": 1.9243735074996948, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453823488, + "loss": 0.2486, + "grad_norm": 1.2542691230773926, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453740544, + "loss": 0.3358, + "grad_norm": 2.4461007118225098, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453758976, + "loss": 0.2705, + "grad_norm": 1.7438157796859741, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453846528, + "loss": 0.3069, + "grad_norm": 2.480360746383667, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453731328, + "loss": 0.3447, + "grad_norm": 2.8746397495269775, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453794304, + "loss": 0.3716, + "grad_norm": 3.260237455368042, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453702144, + "loss": 0.2372, + "grad_norm": 2.816953659057617, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454001664, + "loss": 0.3229, + "grad_norm": 2.9355452060699463, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453725184, + "loss": 0.3398, + "grad_norm": 3.049189329147339, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453691392, + "loss": 0.2697, + "grad_norm": 2.035909414291382, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453829632, + "loss": 0.2936, + "grad_norm": 2.758373498916626, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453927936, + "loss": 0.3547, + "grad_norm": 2.8373870849609375, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453674496, + "loss": 0.4241, + "grad_norm": 3.082672357559204, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453774336, + "loss": 0.4208, + "grad_norm": 3.028663396835327, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453746688, + "loss": 0.2707, + "grad_norm": 2.6282424926757812, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453683712, + "loss": 0.1984, + "grad_norm": 1.6759904623031616, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453901824, + "loss": 0.2819, + "grad_norm": 2.799743413925171, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453798912, + "loss": 0.1996, + "grad_norm": 2.296999931335449, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453746688, + "loss": 0.2791, + "grad_norm": 2.6136858463287354, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453725184, + "loss": 0.3855, + "grad_norm": 2.2122905254364014, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 3.291140096, + "gpu_mem": 1.4537344, + "loss": 0.3524, + "grad_norm": 2.5085608959198, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453666816, + "loss": 0.3594, + "grad_norm": 2.1399080753326416, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453829632, + "loss": 0.1542, + "grad_norm": 1.7707912921905518, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453699072, + "loss": 0.2627, + "grad_norm": 2.2475831508636475, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45381888, + "loss": 0.3552, + "grad_norm": 2.3976614475250244, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453637632, + "loss": 0.2369, + "grad_norm": 1.596405267715454, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453769728, + "loss": 0.2469, + "grad_norm": 2.9309306144714355, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453743616, + "loss": 0.2913, + "grad_norm": 2.43153715133667, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453709824, + "loss": 0.2635, + "grad_norm": 2.521719217300415, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453814272, + "loss": 0.1696, + "grad_norm": 1.9092698097229004, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453809664, + "loss": 0.2914, + "grad_norm": 2.139155626296997, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453668352, + "loss": 0.2583, + "grad_norm": 2.0147974491119385, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453860352, + "loss": 0.2881, + "grad_norm": 3.2696149349212646, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45371136, + "loss": 0.3242, + "grad_norm": 3.260291576385498, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453814272, + "loss": 0.1845, + "grad_norm": 2.3716726303100586, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454017024, + "loss": 0.2167, + "grad_norm": 2.4561824798583984, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45381888, + "loss": 0.2133, + "grad_norm": 1.9550893306732178, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453705216, + "loss": 0.2534, + "grad_norm": 2.1783416271209717, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45371904, + "loss": 0.2003, + "grad_norm": 3.034559726715088, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453763584, + "loss": 0.2712, + "grad_norm": 3.4562106132507324, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453705216, + "loss": 0.2035, + "grad_norm": 2.3324050903320312, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453938688, + "loss": 0.3598, + "grad_norm": 2.3462777137756348, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453969408, + "loss": 0.2396, + "grad_norm": 3.872950792312622, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45389568, + "loss": 0.2716, + "grad_norm": 2.644681453704834, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453783552, + "loss": 0.2773, + "grad_norm": 2.026806592941284, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453732864, + "loss": 0.3092, + "grad_norm": 2.088056802749634, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453702144, + "loss": 0.1717, + "grad_norm": 1.709466576576233, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453725184, + "loss": 0.2074, + "grad_norm": 2.447991371154785, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453808128, + "loss": 0.3464, + "grad_norm": 2.3371737003326416, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453735936, + "loss": 0.2929, + "grad_norm": 4.052587032318115, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453901824, + "loss": 0.2246, + "grad_norm": 2.5455780029296875, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453743616, + "loss": 0.2468, + "grad_norm": 3.649040460586548, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453720576, + "loss": 0.2949, + "grad_norm": 2.2728219032287598, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453846528, + "loss": 0.3096, + "grad_norm": 2.6980154514312744, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45388032, + "loss": 0.2024, + "grad_norm": 1.6067042350769043, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453748224, + "loss": 0.2761, + "grad_norm": 2.5046310424804688, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453884928, + "loss": 0.3015, + "grad_norm": 2.998739242553711, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453798912, + "loss": 0.2316, + "grad_norm": 2.664515733718872, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453763584, + "loss": 0.1442, + "grad_norm": 1.5527210235595703, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453728256, + "loss": 0.2243, + "grad_norm": 2.129601240158081, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453877248, + "loss": 0.248, + "grad_norm": 2.177233934402466, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453766656, + "loss": 0.2062, + "grad_norm": 2.2934036254882812, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45371136, + "loss": 0.2273, + "grad_norm": 2.4996700286865234, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453652992, + "loss": 0.1832, + "grad_norm": 1.6163780689239502, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453709824, + "loss": 0.2247, + "grad_norm": 3.6382203102111816, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45398784, + "loss": 0.2379, + "grad_norm": 2.013258457183838, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45371136, + "loss": 0.2831, + "grad_norm": 2.201864242553711, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454024704, + "loss": 0.2569, + "grad_norm": 2.412001132965088, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453900288, + "loss": 0.1976, + "grad_norm": 2.4555327892303467, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453656064, + "loss": 0.1781, + "grad_norm": 1.9383608102798462, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453715968, + "loss": 0.2498, + "grad_norm": 2.2796216011047363, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453777408, + "loss": 0.2997, + "grad_norm": 2.5772972106933594, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453778944, + "loss": 0.1457, + "grad_norm": 1.8782415390014648, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454032384, + "loss": 0.2267, + "grad_norm": 2.3701887130737305, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453682176, + "loss": 0.4488, + "grad_norm": 4.399943828582764, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453978624, + "loss": 0.2631, + "grad_norm": 2.4196090698242188, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453840384, + "loss": 0.1896, + "grad_norm": 2.048516273498535, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453692928, + "loss": 0.2435, + "grad_norm": 2.6612093448638916, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453832704, + "loss": 0.1639, + "grad_norm": 2.2809085845947266, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45371136, + "loss": 0.3188, + "grad_norm": 2.3731846809387207, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453805056, + "loss": 0.3136, + "grad_norm": 2.5154881477355957, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453823488, + "loss": 0.184, + "grad_norm": 2.2865190505981445, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453775872, + "loss": 0.2138, + "grad_norm": 2.565019369125366, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453682176, + "loss": 0.2071, + "grad_norm": 1.9629261493682861, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453774336, + "loss": 0.2386, + "grad_norm": 2.8802688121795654, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453686784, + "loss": 0.2027, + "grad_norm": 3.1179840564727783, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453729792, + "loss": 0.2684, + "grad_norm": 2.4378418922424316, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453923328, + "loss": 0.3045, + "grad_norm": 3.1592676639556885, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45371904, + "loss": 0.2762, + "grad_norm": 2.454993724822998, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453875712, + "loss": 0.3388, + "grad_norm": 4.071863174438477, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453702144, + "loss": 0.1554, + "grad_norm": 2.4159345626831055, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454115328, + "loss": 0.2966, + "grad_norm": 1.8865463733673096, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453774336, + "loss": 0.3437, + "grad_norm": 3.890566825866699, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453691392, + "loss": 0.3754, + "grad_norm": 2.8612890243530273, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453808128, + "loss": 0.1562, + "grad_norm": 1.4306381940841675, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453763584, + "loss": 0.1289, + "grad_norm": 1.3967266082763672, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453720576, + "loss": 0.2061, + "grad_norm": 3.2110791206359863, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45375744, + "loss": 0.2216, + "grad_norm": 1.7870162725448608, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453844992, + "loss": 0.1635, + "grad_norm": 2.092017412185669, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453763584, + "loss": 0.2781, + "grad_norm": 2.2330682277679443, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453978624, + "loss": 0.4129, + "grad_norm": 3.27122163772583, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453771264, + "loss": 0.2624, + "grad_norm": 3.2668426036834717, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453775872, + "loss": 0.2195, + "grad_norm": 2.148636817932129, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453786624, + "loss": 0.2735, + "grad_norm": 3.1508138179779053, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453825024, + "loss": 0.1802, + "grad_norm": 1.6321183443069458, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453877248, + "loss": 0.2309, + "grad_norm": 2.607900857925415, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453735936, + "loss": 0.2338, + "grad_norm": 1.886138677597046, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453616128, + "loss": 0.2715, + "grad_norm": 2.6831719875335693, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453843456, + "loss": 0.1915, + "grad_norm": 2.0600273609161377, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45408768, + "loss": 0.1929, + "grad_norm": 1.67916738986969, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453748224, + "loss": 0.2425, + "grad_norm": 2.4127755165100098, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453694464, + "loss": 0.3065, + "grad_norm": 2.571768045425415, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45385728, + "loss": 0.2602, + "grad_norm": 2.5837905406951904, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453797376, + "loss": 0.2798, + "grad_norm": 2.0310022830963135, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453777408, + "loss": 0.2634, + "grad_norm": 2.534637451171875, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453712896, + "loss": 0.3489, + "grad_norm": 3.0440380573272705, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45414144, + "loss": 0.1556, + "grad_norm": 2.0398788452148438, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453848064, + "loss": 0.2723, + "grad_norm": 2.3819687366485596, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45370368, + "loss": 0.292, + "grad_norm": 1.6775835752487183, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45375744, + "loss": 0.3043, + "grad_norm": 2.653355360031128, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 3.291140096, + "gpu_mem": 1.454175232, + "loss": 0.1654, + "grad_norm": 2.10739803314209, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453944832, + "loss": 0.2695, + "grad_norm": 2.905089855194092, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453729792, + "loss": 0.3554, + "grad_norm": 2.2342350482940674, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453814272, + "loss": 0.2116, + "grad_norm": 2.4138240814208984, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453739008, + "loss": 0.3009, + "grad_norm": 2.43693208694458, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453774336, + "loss": 0.3595, + "grad_norm": 4.465625762939453, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 3.291140096, + "gpu_mem": 1.45385728, + "loss": 0.247, + "grad_norm": 2.9458413124084473, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453774336, + "loss": 0.3473, + "grad_norm": 3.9143316745758057, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453800448, + "loss": 0.1845, + "grad_norm": 2.9766409397125244, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 3.291140096, + "gpu_mem": 1.453800448, + "train_runtime": 4557.6762, + "train_samples_per_second": 4.137, + "train_steps_per_second": 0.065, + "total_flos": 4.809368057590579e+16, + "train_loss": 0.5715306683563862 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a34e999804ff05ab393ed2117c936e4d7827f88f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2e1dac555eb1e3b72ac38d61f8c27d359b31df5a --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.810091743119266 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..ad77c2e37ef42e756d0721dd530ac8ae50180c74 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-boolq-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-boolq-r8-a2", + "seed": 42, + "timestamp": "2025-09-01T01:54:13.257291" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..4dbdac6c6de4f342e195d15871f887c5c88b28b0 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-boolq-r8-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 3.410243584, + "gpu_mem": 1.075236864, + "loss": 8.7378, + "grad_norm": 26.1995906829834, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 3.411030016, + "gpu_mem": 1.125837312, + "loss": 8.7839, + "grad_norm": 28.54578399658203, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 3.41161984, + "gpu_mem": 1.125755904, + "loss": 8.5794, + "grad_norm": 30.11810302734375, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 3.412209664, + "gpu_mem": 1.125755904, + "loss": 8.326, + "grad_norm": 30.236257553100586, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 3.412799488, + "gpu_mem": 1.125691392, + "loss": 7.7685, + "grad_norm": 26.87846565246582, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 3.413389312, + "gpu_mem": 1.12571136, + "loss": 7.5338, + "grad_norm": 27.69971466064453, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 3.413979136, + "gpu_mem": 1.125763584, + "loss": 6.5283, + "grad_norm": 35.67375564575195, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 3.414372352, + "gpu_mem": 1.1258496, + "loss": 5.4259, + "grad_norm": 35.3130989074707, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 3.414962176, + "gpu_mem": 1.12575744, + "loss": 4.1362, + "grad_norm": 30.471166610717773, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 3.415552, + "gpu_mem": 1.1256576, + "loss": 3.0231, + "grad_norm": 23.184316635131836, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 3.415945216, + "gpu_mem": 1.125762048, + "loss": 1.9223, + "grad_norm": 12.0885009765625, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 3.416338432, + "gpu_mem": 1.12613376, + "loss": 1.4266, + "grad_norm": 7.942366600036621, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 3.416731648, + "gpu_mem": 1.125737472, + "loss": 1.1452, + "grad_norm": 7.100488662719727, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 3.417124864, + "gpu_mem": 1.125714432, + "loss": 0.8461, + "grad_norm": 4.046794414520264, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 3.417321472, + "gpu_mem": 1.125652992, + "loss": 0.8495, + "grad_norm": 6.2335286140441895, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 3.417714688, + "gpu_mem": 1.125737472, + "loss": 0.7407, + "grad_norm": 2.684365749359131, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 3.418107904, + "gpu_mem": 1.125777408, + "loss": 0.7832, + "grad_norm": 7.295577049255371, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 3.41850112, + "gpu_mem": 1.125840384, + "loss": 0.8286, + "grad_norm": 10.931790351867676, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 3.418894336, + "gpu_mem": 1.125677568, + "loss": 0.7148, + "grad_norm": 2.9000322818756104, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 3.419090944, + "gpu_mem": 1.125789696, + "loss": 0.6265, + "grad_norm": 2.836984872817993, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 3.41948416, + "gpu_mem": 1.125947904, + "loss": 0.7164, + "grad_norm": 5.819405555725098, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 3.419680768, + "gpu_mem": 1.125840384, + "loss": 0.7838, + "grad_norm": 7.818148612976074, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 3.420073984, + "gpu_mem": 1.125812736, + "loss": 0.7046, + "grad_norm": 3.987525224685669, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 3.4204672, + "gpu_mem": 1.125869568, + "loss": 0.6261, + "grad_norm": 5.296741485595703, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 3.420663808, + "gpu_mem": 1.125654528, + "loss": 0.6603, + "grad_norm": 4.211419582366943, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 3.421057024, + "gpu_mem": 1.125709824, + "loss": 0.7015, + "grad_norm": 4.6295166015625, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 3.421253632, + "gpu_mem": 1.126001664, + "loss": 0.6938, + "grad_norm": 5.898733139038086, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 3.42145024, + "gpu_mem": 1.12568064, + "loss": 0.6608, + "grad_norm": 1.269522786140442, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 3.421843456, + "gpu_mem": 1.125745152, + "loss": 0.6048, + "grad_norm": 1.127472996711731, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 3.422040064, + "gpu_mem": 1.125823488, + "loss": 0.6482, + "grad_norm": 3.699575424194336, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 3.422236672, + "gpu_mem": 1.12562688, + "loss": 0.5839, + "grad_norm": 3.921264886856079, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 3.42243328, + "gpu_mem": 1.125740544, + "loss": 0.6626, + "grad_norm": 1.5626909732818604, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 3.422629888, + "gpu_mem": 1.125978624, + "loss": 0.7618, + "grad_norm": 3.591050386428833, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 3.422826496, + "gpu_mem": 1.12568064, + "loss": 0.5929, + "grad_norm": 1.1916701793670654, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 3.423023104, + "gpu_mem": 1.125891072, + "loss": 0.6488, + "grad_norm": 2.4754061698913574, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 3.423219712, + "gpu_mem": 1.12584192, + "loss": 0.7123, + "grad_norm": 5.0760817527771, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 3.42341632, + "gpu_mem": 1.125652992, + "loss": 0.7396, + "grad_norm": 6.977909088134766, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 3.423612928, + "gpu_mem": 1.125900288, + "loss": 0.6633, + "grad_norm": 1.0818983316421509, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 3.423809536, + "gpu_mem": 1.12627968, + "loss": 0.6899, + "grad_norm": 3.0420215129852295, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 3.424006144, + "gpu_mem": 1.1258496, + "loss": 0.6012, + "grad_norm": 2.6977217197418213, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 3.424202752, + "gpu_mem": 1.126076928, + "loss": 0.6328, + "grad_norm": 1.239274263381958, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 3.42439936, + "gpu_mem": 1.125974016, + "loss": 0.6688, + "grad_norm": 3.4270358085632324, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 3.42439936, + "gpu_mem": 1.12579584, + "loss": 0.765, + "grad_norm": 5.451464653015137, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 3.424595968, + "gpu_mem": 1.125938688, + "loss": 0.6589, + "grad_norm": 3.830467700958252, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 3.424792576, + "gpu_mem": 1.12571904, + "loss": 0.7541, + "grad_norm": 2.8271572589874268, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 3.424989184, + "gpu_mem": 1.125961728, + "loss": 0.5737, + "grad_norm": 1.1384495496749878, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 3.424989184, + "gpu_mem": 1.125685248, + "loss": 0.5814, + "grad_norm": 0.9779514074325562, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 3.424989184, + "gpu_mem": 1.125762048, + "loss": 0.5721, + "grad_norm": 1.9588422775268555, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 3.425185792, + "gpu_mem": 1.125778944, + "loss": 0.6653, + "grad_norm": 2.0305728912353516, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 3.425185792, + "gpu_mem": 1.125717504, + "loss": 0.5551, + "grad_norm": 1.525816559791565, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 3.4253824, + "gpu_mem": 1.125722112, + "loss": 0.5807, + "grad_norm": 2.668461799621582, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 3.4253824, + "gpu_mem": 1.125801984, + "loss": 0.5911, + "grad_norm": 2.0633888244628906, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 3.425579008, + "gpu_mem": 1.125825024, + "loss": 0.5287, + "grad_norm": 1.2228673696517944, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 3.425579008, + "gpu_mem": 1.125752832, + "loss": 0.6796, + "grad_norm": 4.010019779205322, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 3.425775616, + "gpu_mem": 1.126023168, + "loss": 0.5583, + "grad_norm": 1.2962052822113037, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 3.425775616, + "gpu_mem": 1.125809664, + "loss": 0.6339, + "grad_norm": 1.3489032983779907, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 3.425972224, + "gpu_mem": 1.12580352, + "loss": 0.6388, + "grad_norm": 3.5700502395629883, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 3.426168832, + "gpu_mem": 1.125699072, + "loss": 0.6169, + "grad_norm": 3.135751962661743, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 3.426168832, + "gpu_mem": 1.125715968, + "loss": 0.6196, + "grad_norm": 1.9370789527893066, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 3.426168832, + "gpu_mem": 1.125809664, + "loss": 0.4922, + "grad_norm": 1.549032211303711, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 3.42636544, + "gpu_mem": 1.125820416, + "loss": 0.5887, + "grad_norm": 2.3944451808929443, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 3.426562048, + "gpu_mem": 1.125808128, + "loss": 0.7917, + "grad_norm": 4.3218560218811035, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 3.426562048, + "gpu_mem": 1.125800448, + "loss": 0.3926, + "grad_norm": 1.1173707246780396, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 3.426562048, + "gpu_mem": 1.125729792, + "loss": 0.5133, + "grad_norm": 1.562133550643921, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 3.426758656, + "gpu_mem": 1.125774336, + "loss": 0.5806, + "grad_norm": 1.6686031818389893, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 3.426758656, + "gpu_mem": 1.125967872, + "loss": 0.5154, + "grad_norm": 1.7546840906143188, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 3.426758656, + "gpu_mem": 1.125677568, + "loss": 0.5787, + "grad_norm": 1.644349455833435, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 3.426955264, + "gpu_mem": 1.125645312, + "loss": 0.7204, + "grad_norm": 3.089144468307495, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 3.426955264, + "gpu_mem": 1.12571136, + "loss": 0.5287, + "grad_norm": 1.2680445909500122, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 3.426955264, + "gpu_mem": 1.125705216, + "loss": 0.5519, + "grad_norm": 1.632736086845398, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 3.427151872, + "gpu_mem": 1.12593408, + "loss": 0.6061, + "grad_norm": 3.43094801902771, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 3.427151872, + "gpu_mem": 1.1259264, + "loss": 0.563, + "grad_norm": 2.4741904735565186, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 3.427151872, + "gpu_mem": 1.125892608, + "loss": 0.7027, + "grad_norm": 3.3420116901397705, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 3.427151872, + "gpu_mem": 1.125752832, + "loss": 0.5317, + "grad_norm": 1.935846209526062, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 3.427151872, + "gpu_mem": 1.125677568, + "loss": 0.4532, + "grad_norm": 0.9749165773391724, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 3.427151872, + "gpu_mem": 1.125617664, + "loss": 0.507, + "grad_norm": 1.4989597797393799, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 3.427151872, + "gpu_mem": 1.125691392, + "loss": 0.4563, + "grad_norm": 1.0682824850082397, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 3.427151872, + "gpu_mem": 1.125743616, + "loss": 0.7282, + "grad_norm": 6.257880210876465, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 3.42734848, + "gpu_mem": 1.125875712, + "loss": 0.7232, + "grad_norm": 3.974815845489502, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 3.42734848, + "gpu_mem": 1.125766656, + "loss": 0.5114, + "grad_norm": 1.9176770448684692, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 3.427545088, + "gpu_mem": 1.125646848, + "loss": 0.5146, + "grad_norm": 2.0399527549743652, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 3.427545088, + "gpu_mem": 1.125715968, + "loss": 0.5525, + "grad_norm": 3.1574547290802, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 3.427545088, + "gpu_mem": 1.125815808, + "loss": 0.6222, + "grad_norm": 2.2953574657440186, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 3.427545088, + "gpu_mem": 1.125778944, + "loss": 0.4909, + "grad_norm": 2.386277437210083, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 3.427545088, + "gpu_mem": 1.1258112, + "loss": 0.5816, + "grad_norm": 2.171255588531494, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 3.427545088, + "gpu_mem": 1.125762048, + "loss": 0.607, + "grad_norm": 2.587982416152954, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 3.427545088, + "gpu_mem": 1.125769728, + "loss": 0.5444, + "grad_norm": 2.332745313644409, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125914112, + "loss": 0.4747, + "grad_norm": 1.3445320129394531, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125696, + "loss": 0.5293, + "grad_norm": 1.7004798650741577, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 3.427741696, + "gpu_mem": 1.12574976, + "loss": 0.4815, + "grad_norm": 1.8152844905853271, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125717504, + "loss": 0.483, + "grad_norm": 2.4496662616729736, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125798912, + "loss": 0.6074, + "grad_norm": 3.082737445831299, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125602304, + "loss": 0.6955, + "grad_norm": 2.550952672958374, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125715968, + "loss": 0.5038, + "grad_norm": 1.306609869003296, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125735936, + "loss": 0.5241, + "grad_norm": 1.3793981075286865, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125774336, + "loss": 0.4329, + "grad_norm": 1.3967863321304321, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125758976, + "loss": 0.565, + "grad_norm": 1.5141547918319702, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125671424, + "loss": 0.4727, + "grad_norm": 1.47848641872406, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125620736, + "loss": 0.5385, + "grad_norm": 1.9108830690383911, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125737472, + "loss": 0.5099, + "grad_norm": 1.7991509437561035, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125769728, + "loss": 0.492, + "grad_norm": 2.6267879009246826, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 3.427741696, + "gpu_mem": 1.12580352, + "loss": 0.5732, + "grad_norm": 2.2555246353149414, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125854208, + "loss": 0.5725, + "grad_norm": 2.3993818759918213, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125758976, + "loss": 0.5512, + "grad_norm": 2.4269518852233887, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125860352, + "loss": 0.4985, + "grad_norm": 1.7222180366516113, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 3.427741696, + "gpu_mem": 1.1258112, + "loss": 0.5036, + "grad_norm": 2.7957687377929688, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125699072, + "loss": 0.4633, + "grad_norm": 2.214627742767334, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125883392, + "loss": 0.5273, + "grad_norm": 1.683442234992981, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125737472, + "loss": 0.5959, + "grad_norm": 2.7632293701171875, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125740544, + "loss": 0.4838, + "grad_norm": 2.2415990829467773, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125709824, + "loss": 0.4854, + "grad_norm": 1.8701057434082031, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125755904, + "loss": 0.5012, + "grad_norm": 2.4984281063079834, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125746688, + "loss": 0.501, + "grad_norm": 2.603296995162964, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125728256, + "loss": 0.4775, + "grad_norm": 2.500393867492676, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 3.427741696, + "gpu_mem": 1.12580352, + "loss": 0.5571, + "grad_norm": 2.1858267784118652, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125723648, + "loss": 0.4345, + "grad_norm": 1.9086796045303345, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125614592, + "loss": 0.4208, + "grad_norm": 2.3681225776672363, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 3.427741696, + "gpu_mem": 1.125852672, + "loss": 0.5979, + "grad_norm": 4.626540660858154, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 3.427741696, + "gpu_mem": 1.126023168, + "loss": 0.4919, + "grad_norm": 2.7611777782440186, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 3.427938304, + "gpu_mem": 1.125755904, + "loss": 0.4805, + "grad_norm": 2.8226702213287354, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 3.427938304, + "gpu_mem": 1.125783552, + "loss": 0.4785, + "grad_norm": 1.9544380903244019, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 3.428134912, + "gpu_mem": 1.12583424, + "loss": 0.3866, + "grad_norm": 1.9233179092407227, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125643776, + "loss": 0.4325, + "grad_norm": 2.385383367538452, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 3.428134912, + "gpu_mem": 1.126086144, + "loss": 0.4122, + "grad_norm": 1.8369063138961792, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125812736, + "loss": 0.4932, + "grad_norm": 1.7532641887664795, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125696, + "loss": 0.5018, + "grad_norm": 2.3380966186523438, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 3.428134912, + "gpu_mem": 1.126135296, + "loss": 0.5135, + "grad_norm": 2.1224722862243652, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 3.428134912, + "gpu_mem": 1.12591104, + "loss": 0.4488, + "grad_norm": 2.445929765701294, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125950976, + "loss": 0.4852, + "grad_norm": 1.6421163082122803, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125732864, + "loss": 0.6086, + "grad_norm": 1.6469106674194336, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125861888, + "loss": 0.4663, + "grad_norm": 2.1512928009033203, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125943296, + "loss": 0.3919, + "grad_norm": 2.774719715118408, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 3.428134912, + "gpu_mem": 1.12572672, + "loss": 0.4558, + "grad_norm": 1.967437505722046, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125860352, + "loss": 0.4149, + "grad_norm": 3.297884941101074, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125883392, + "loss": 0.4586, + "grad_norm": 1.6875859498977661, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125720576, + "loss": 0.4531, + "grad_norm": 2.1743087768554688, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125600768, + "loss": 0.5072, + "grad_norm": 2.233001708984375, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125782016, + "loss": 0.4689, + "grad_norm": 2.294285297393799, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 3.428134912, + "gpu_mem": 1.12568064, + "loss": 0.4524, + "grad_norm": 2.164747953414917, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125732864, + "loss": 0.4312, + "grad_norm": 1.6793618202209473, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 3.428134912, + "gpu_mem": 1.12576512, + "loss": 0.5573, + "grad_norm": 3.8546879291534424, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125904896, + "loss": 0.4436, + "grad_norm": 3.1343538761138916, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125888, + "loss": 0.5815, + "grad_norm": 2.3474795818328857, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 3.428134912, + "gpu_mem": 1.12608, + "loss": 0.4996, + "grad_norm": 1.5809093713760376, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125791232, + "loss": 0.4287, + "grad_norm": 1.5517871379852295, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 3.428134912, + "gpu_mem": 1.12582656, + "loss": 0.529, + "grad_norm": 2.5757737159729004, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 3.428134912, + "gpu_mem": 1.125725184, + "loss": 0.4854, + "grad_norm": 2.3408732414245605, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151120896, + "loss": 0.6334, + "grad_norm": 4.124669075012207, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151056384, + "loss": 0.3634, + "grad_norm": 2.3019583225250244, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150893568, + "loss": 0.3833, + "grad_norm": 1.5041892528533936, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15096576, + "loss": 0.3864, + "grad_norm": 1.595664143562317, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151001088, + "loss": 0.3319, + "grad_norm": 1.6147993803024292, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151025664, + "loss": 0.4996, + "grad_norm": 2.1374635696411133, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150987264, + "loss": 0.3776, + "grad_norm": 1.9883310794830322, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151208448, + "loss": 0.4377, + "grad_norm": 1.6302087306976318, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151116288, + "loss": 0.2468, + "grad_norm": 1.4566242694854736, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151022592, + "loss": 0.405, + "grad_norm": 1.8313920497894287, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150948864, + "loss": 0.3275, + "grad_norm": 2.2069098949432373, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151297536, + "loss": 0.3212, + "grad_norm": 2.0373239517211914, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150892032, + "loss": 0.2939, + "grad_norm": 2.416083335876465, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150838272, + "loss": 0.3446, + "grad_norm": 1.6689667701721191, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151613952, + "loss": 0.3072, + "grad_norm": 2.0056674480438232, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151090176, + "loss": 0.3809, + "grad_norm": 2.123126268386841, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151002624, + "loss": 0.4749, + "grad_norm": 2.2435545921325684, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150951936, + "loss": 0.2802, + "grad_norm": 2.1707935333251953, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151047168, + "loss": 0.3322, + "grad_norm": 1.7609379291534424, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150964224, + "loss": 0.3831, + "grad_norm": 2.0881130695343018, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150982656, + "loss": 0.4039, + "grad_norm": 1.9189516305923462, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151070208, + "loss": 0.3563, + "grad_norm": 3.017411947250366, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150955008, + "loss": 0.4269, + "grad_norm": 2.6068131923675537, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151017984, + "loss": 0.4944, + "grad_norm": 3.4840762615203857, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150925824, + "loss": 0.2722, + "grad_norm": 2.579272985458374, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151225344, + "loss": 0.3796, + "grad_norm": 2.225433588027954, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150948864, + "loss": 0.3958, + "grad_norm": 2.4849770069122314, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150915072, + "loss": 0.4057, + "grad_norm": 2.0921683311462402, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151053312, + "loss": 0.3398, + "grad_norm": 2.74931263923645, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151151616, + "loss": 0.3925, + "grad_norm": 2.009072780609131, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150898176, + "loss": 0.3925, + "grad_norm": 2.179717779159546, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150998016, + "loss": 0.4901, + "grad_norm": 2.84321665763855, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150970368, + "loss": 0.3518, + "grad_norm": 2.1980111598968506, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150907392, + "loss": 0.2876, + "grad_norm": 2.4873311519622803, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151125504, + "loss": 0.3581, + "grad_norm": 3.2393558025360107, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151022592, + "loss": 0.3579, + "grad_norm": 2.248528480529785, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150970368, + "loss": 0.3735, + "grad_norm": 2.068347692489624, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150948864, + "loss": 0.416, + "grad_norm": 2.761140823364258, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15095808, + "loss": 0.3646, + "grad_norm": 2.420140027999878, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150890496, + "loss": 0.4491, + "grad_norm": 2.4814517498016357, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151053312, + "loss": 0.255, + "grad_norm": 2.2799582481384277, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150922752, + "loss": 0.4211, + "grad_norm": 2.5999755859375, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15104256, + "loss": 0.4414, + "grad_norm": 3.8879923820495605, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150861312, + "loss": 0.2919, + "grad_norm": 2.4823381900787354, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150993408, + "loss": 0.2909, + "grad_norm": 3.563950538635254, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150967296, + "loss": 0.2698, + "grad_norm": 1.580678105354309, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150933504, + "loss": 0.3329, + "grad_norm": 2.132497549057007, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151037952, + "loss": 0.2507, + "grad_norm": 2.0013768672943115, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151033344, + "loss": 0.3384, + "grad_norm": 2.2814812660217285, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150892032, + "loss": 0.4074, + "grad_norm": 2.490309953689575, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151084032, + "loss": 0.3296, + "grad_norm": 1.9882185459136963, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15093504, + "loss": 0.374, + "grad_norm": 2.8949124813079834, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151037952, + "loss": 0.3449, + "grad_norm": 2.137866497039795, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151240704, + "loss": 0.2758, + "grad_norm": 2.045112133026123, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15104256, + "loss": 0.2837, + "grad_norm": 2.212925434112549, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150928896, + "loss": 0.2982, + "grad_norm": 2.0492701530456543, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15094272, + "loss": 0.27, + "grad_norm": 1.9272792339324951, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150987264, + "loss": 0.265, + "grad_norm": 2.206463098526001, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150928896, + "loss": 0.2654, + "grad_norm": 2.140951156616211, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151162368, + "loss": 0.4013, + "grad_norm": 3.3770956993103027, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151193088, + "loss": 0.3595, + "grad_norm": 3.8507437705993652, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15111936, + "loss": 0.3987, + "grad_norm": 2.9722373485565186, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151007232, + "loss": 0.3938, + "grad_norm": 2.515491247177124, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150956544, + "loss": 0.3097, + "grad_norm": 3.0874598026275635, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150925824, + "loss": 0.298, + "grad_norm": 2.131585121154785, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150948864, + "loss": 0.3317, + "grad_norm": 2.942235231399536, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151031808, + "loss": 0.3189, + "grad_norm": 2.5085628032684326, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150959616, + "loss": 0.5257, + "grad_norm": 5.828885078430176, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151125504, + "loss": 0.2747, + "grad_norm": 1.972609043121338, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150967296, + "loss": 0.3539, + "grad_norm": 3.8559679985046387, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150944256, + "loss": 0.3141, + "grad_norm": 2.4019277095794678, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151070208, + "loss": 0.4596, + "grad_norm": 3.182147264480591, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151104, + "loss": 0.3402, + "grad_norm": 1.8264169692993164, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150971904, + "loss": 0.3546, + "grad_norm": 2.7376444339752197, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151108608, + "loss": 0.3975, + "grad_norm": 2.827303647994995, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151022592, + "loss": 0.2844, + "grad_norm": 3.039685010910034, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150987264, + "loss": 0.249, + "grad_norm": 2.339630365371704, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150951936, + "loss": 0.2359, + "grad_norm": 2.153271198272705, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151100928, + "loss": 0.3321, + "grad_norm": 2.406315803527832, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150990336, + "loss": 0.3475, + "grad_norm": 2.4297118186950684, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15093504, + "loss": 0.3103, + "grad_norm": 2.009911298751831, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150876672, + "loss": 0.2991, + "grad_norm": 1.6910120248794556, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150933504, + "loss": 0.3052, + "grad_norm": 2.1668050289154053, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15121152, + "loss": 0.3701, + "grad_norm": 2.9518678188323975, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 3.428134912, + "gpu_mem": 1.15093504, + "loss": 0.3427, + "grad_norm": 1.7339729070663452, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151248384, + "loss": 0.3023, + "grad_norm": 2.895573616027832, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 3.428134912, + "gpu_mem": 1.151123968, + "loss": 0.2956, + "grad_norm": 1.7656444311141968, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 3.428134912, + "gpu_mem": 1.150879744, + "loss": 0.2534, + "grad_norm": 1.9604936838150024, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150939648, + "loss": 0.3585, + "grad_norm": 2.8539772033691406, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151001088, + "loss": 0.3756, + "grad_norm": 2.740112781524658, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151002624, + "loss": 0.3434, + "grad_norm": 2.702164888381958, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151256064, + "loss": 0.4156, + "grad_norm": 3.2156434059143066, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150905856, + "loss": 0.5157, + "grad_norm": 3.0920889377593994, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151202304, + "loss": 0.35, + "grad_norm": 2.421147346496582, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151064064, + "loss": 0.363, + "grad_norm": 2.704820156097412, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150916608, + "loss": 0.4123, + "grad_norm": 2.898569107055664, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151056384, + "loss": 0.2162, + "grad_norm": 2.04628586769104, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15093504, + "loss": 0.3895, + "grad_norm": 2.126142978668213, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151028736, + "loss": 0.4034, + "grad_norm": 2.276557207107544, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151047168, + "loss": 0.1957, + "grad_norm": 1.5787477493286133, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150999552, + "loss": 0.3401, + "grad_norm": 2.9995689392089844, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150905856, + "loss": 0.2717, + "grad_norm": 2.7105324268341064, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150998016, + "loss": 0.3489, + "grad_norm": 2.5771191120147705, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150910464, + "loss": 0.2211, + "grad_norm": 2.0282697677612305, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150953472, + "loss": 0.3152, + "grad_norm": 2.027552366256714, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151147008, + "loss": 0.3807, + "grad_norm": 2.3190534114837646, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15094272, + "loss": 0.3435, + "grad_norm": 2.573007106781006, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151099392, + "loss": 0.3364, + "grad_norm": 2.3798046112060547, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150925824, + "loss": 0.2871, + "grad_norm": 2.4988105297088623, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151339008, + "loss": 0.342, + "grad_norm": 1.6820766925811768, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150998016, + "loss": 0.3808, + "grad_norm": 3.190187931060791, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150915072, + "loss": 0.386, + "grad_norm": 2.526961088180542, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151031808, + "loss": 0.1903, + "grad_norm": 1.7485580444335938, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150987264, + "loss": 0.2024, + "grad_norm": 1.873334527015686, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150944256, + "loss": 0.2634, + "grad_norm": 2.3487143516540527, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15098112, + "loss": 0.2421, + "grad_norm": 2.3149917125701904, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151068672, + "loss": 0.3037, + "grad_norm": 2.4313669204711914, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150987264, + "loss": 0.3437, + "grad_norm": 2.0913071632385254, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151202304, + "loss": 0.4368, + "grad_norm": 3.2440764904022217, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150994944, + "loss": 0.3583, + "grad_norm": 2.756072998046875, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150999552, + "loss": 0.2927, + "grad_norm": 2.6357853412628174, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151010304, + "loss": 0.3663, + "grad_norm": 3.0727593898773193, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151048704, + "loss": 0.2852, + "grad_norm": 2.724729061126709, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151100928, + "loss": 0.2868, + "grad_norm": 2.6316792964935303, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150959616, + "loss": 0.2989, + "grad_norm": 1.9837610721588135, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150839808, + "loss": 0.3541, + "grad_norm": 2.509685754776001, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151067136, + "loss": 0.3187, + "grad_norm": 3.2919540405273438, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15131136, + "loss": 0.2671, + "grad_norm": 2.7242918014526367, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150971904, + "loss": 0.3522, + "grad_norm": 2.346522569656372, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150918144, + "loss": 0.3514, + "grad_norm": 2.1101818084716797, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15108096, + "loss": 0.3938, + "grad_norm": 2.36015248298645, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151021056, + "loss": 0.4156, + "grad_norm": 2.557774543762207, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151001088, + "loss": 0.2794, + "grad_norm": 2.1779892444610596, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150936576, + "loss": 0.3958, + "grad_norm": 2.4951038360595703, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15136512, + "loss": 0.2689, + "grad_norm": 2.382336139678955, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151071744, + "loss": 0.3074, + "grad_norm": 3.1651668548583984, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15092736, + "loss": 0.3036, + "grad_norm": 1.8676244020462036, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15098112, + "loss": 0.3142, + "grad_norm": 2.0635573863983154, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151398912, + "loss": 0.2157, + "grad_norm": 1.9075093269348145, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151168512, + "loss": 0.2813, + "grad_norm": 2.295100688934326, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150953472, + "loss": 0.4683, + "grad_norm": 3.1202943325042725, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151037952, + "loss": 0.2679, + "grad_norm": 2.079688549041748, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150962688, + "loss": 0.402, + "grad_norm": 2.4235527515411377, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150998016, + "loss": 0.3175, + "grad_norm": 2.2637393474578857, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 3.427938304, + "gpu_mem": 1.15108096, + "loss": 0.3392, + "grad_norm": 3.0754969120025635, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 3.427938304, + "gpu_mem": 1.150998016, + "loss": 0.3904, + "grad_norm": 3.2826051712036133, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151024128, + "loss": 0.2956, + "grad_norm": 2.351271629333496, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 3.427938304, + "gpu_mem": 1.151024128, + "train_runtime": 4528.4118, + "train_samples_per_second": 4.163, + "train_steps_per_second": 0.065, + "total_flos": 4.723488642741043e+16, + "train_loss": 0.6811871752649749 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f43ee5d95e6efa86bc12e96d56fbf5a2c265b7 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..bdcad625a762fc430e1bc2a66e166215d8022f58 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.7796255725951006 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f4a78538b4895baa6329e1d23a12fb63af98328a --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-hellaswag-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-hellaswag-r2-a2", + "seed": 42, + "timestamp": "2025-08-31T23:22:08.410558" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..99171165dfc48677b07582fb21e4aece88ea71f2 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r2-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 3.603488768, + "gpu_mem": 1.05629952, + "loss": 4.3397, + "grad_norm": 41.043678283691406, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 3.604471808, + "gpu_mem": 1.068908544, + "loss": 4.4533, + "grad_norm": 54.867210388183594, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 3.605454848, + "gpu_mem": 1.068916224, + "loss": 4.224, + "grad_norm": 82.9432144165039, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 3.606634496, + "gpu_mem": 1.068950016, + "loss": 4.5355, + "grad_norm": 105.09892272949219, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 3.607617536, + "gpu_mem": 1.068913152, + "loss": 4.3378, + "grad_norm": 156.826416015625, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 3.608600576, + "gpu_mem": 1.068959232, + "loss": 4.1828, + "grad_norm": 239.82785034179688, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 3.609387008, + "gpu_mem": 1.068919296, + "loss": 3.7907, + "grad_norm": 255.95201110839844, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 3.61017344, + "gpu_mem": 1.068950016, + "loss": 3.516, + "grad_norm": 96.03472137451172, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 3.610959872, + "gpu_mem": 1.068950016, + "loss": 3.3294, + "grad_norm": 18.508323669433594, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 3.611746304, + "gpu_mem": 1.068893184, + "loss": 3.2187, + "grad_norm": 10.667830467224121, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 3.612532736, + "gpu_mem": 1.068913152, + "loss": 3.1372, + "grad_norm": 12.046659469604492, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 3.613319168, + "gpu_mem": 1.06891008, + "loss": 3.3133, + "grad_norm": 13.522429466247559, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 3.613908992, + "gpu_mem": 1.0689024, + "loss": 2.9476, + "grad_norm": 16.53629493713379, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 3.614695424, + "gpu_mem": 1.068928512, + "loss": 2.7344, + "grad_norm": 21.96213722229004, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 3.615481856, + "gpu_mem": 1.068926976, + "loss": 2.3951, + "grad_norm": 25.21925163269043, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 3.616268288, + "gpu_mem": 1.068919296, + "loss": 2.2664, + "grad_norm": 10.447446823120117, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 3.61705472, + "gpu_mem": 1.068919296, + "loss": 1.9629, + "grad_norm": 6.2509636878967285, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 3.617644544, + "gpu_mem": 1.068919296, + "loss": 1.8987, + "grad_norm": 6.279534339904785, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 3.618234368, + "gpu_mem": 1.068919296, + "loss": 1.6982, + "grad_norm": 3.894256353378296, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 3.6190208, + "gpu_mem": 1.068893184, + "loss": 1.5199, + "grad_norm": 4.565058708190918, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 3.619610624, + "gpu_mem": 1.06891008, + "loss": 1.5504, + "grad_norm": 5.891337871551514, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 3.620200448, + "gpu_mem": 1.06891776, + "loss": 1.5381, + "grad_norm": 3.0525569915771484, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 3.62098688, + "gpu_mem": 1.068931584, + "loss": 1.4615, + "grad_norm": 1.8830287456512451, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 3.621576704, + "gpu_mem": 1.068916224, + "loss": 1.4286, + "grad_norm": 2.557070016860962, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 3.622166528, + "gpu_mem": 1.068903936, + "loss": 1.5127, + "grad_norm": 5.478477954864502, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 3.622756352, + "gpu_mem": 1.06891008, + "loss": 1.55, + "grad_norm": 6.982666969299316, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 3.623346176, + "gpu_mem": 1.06891776, + "loss": 1.4835, + "grad_norm": 3.575786828994751, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 3.623936, + "gpu_mem": 1.068913152, + "loss": 1.4417, + "grad_norm": 3.5137927532196045, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 3.624329216, + "gpu_mem": 1.068922368, + "loss": 1.4264, + "grad_norm": 2.9167840480804443, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 3.62491904, + "gpu_mem": 1.06889472, + "loss": 1.3955, + "grad_norm": 2.1484053134918213, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 3.625508864, + "gpu_mem": 1.068950016, + "loss": 1.4061, + "grad_norm": 2.1221139430999756, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 3.626098688, + "gpu_mem": 1.068942336, + "loss": 1.3943, + "grad_norm": 2.1024370193481445, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 3.626688512, + "gpu_mem": 1.068896256, + "loss": 1.3767, + "grad_norm": 1.2469357252120972, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 3.627278336, + "gpu_mem": 1.068914688, + "loss": 1.3986, + "grad_norm": 1.5803430080413818, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 3.62786816, + "gpu_mem": 1.068936192, + "loss": 1.471, + "grad_norm": 4.4713053703308105, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 3.628457984, + "gpu_mem": 1.068934656, + "loss": 1.3938, + "grad_norm": 1.7546533346176147, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 3.629047808, + "gpu_mem": 1.068966912, + "loss": 1.4003, + "grad_norm": 1.5442172288894653, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 3.629441024, + "gpu_mem": 1.068919296, + "loss": 1.4512, + "grad_norm": 1.7243833541870117, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 3.630030848, + "gpu_mem": 1.068976128, + "loss": 1.3224, + "grad_norm": 1.6897941827774048, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 3.630620672, + "gpu_mem": 1.068903936, + "loss": 1.4738, + "grad_norm": 2.391934871673584, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 3.631013888, + "gpu_mem": 1.068931584, + "loss": 1.4114, + "grad_norm": 1.297035813331604, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 3.631603712, + "gpu_mem": 1.068945408, + "loss": 1.4009, + "grad_norm": 1.2073386907577515, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 3.632193536, + "gpu_mem": 1.068951552, + "loss": 1.377, + "grad_norm": 0.608805775642395, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 3.632586752, + "gpu_mem": 1.068930048, + "loss": 1.3888, + "grad_norm": 0.8392311334609985, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 3.633176576, + "gpu_mem": 1.068930048, + "loss": 1.396, + "grad_norm": 0.8560051918029785, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 3.633569792, + "gpu_mem": 1.068930048, + "loss": 1.398, + "grad_norm": 1.6736313104629517, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 3.634159616, + "gpu_mem": 1.068916224, + "loss": 1.4094, + "grad_norm": 0.9003703594207764, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 3.63474944, + "gpu_mem": 1.068934656, + "loss": 1.3779, + "grad_norm": 1.181053638458252, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 3.635142656, + "gpu_mem": 1.068946944, + "loss": 1.436, + "grad_norm": 1.4775186777114868, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 3.63573248, + "gpu_mem": 1.068923904, + "loss": 1.4002, + "grad_norm": 1.2089508771896362, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 3.636125696, + "gpu_mem": 1.068908544, + "loss": 1.3782, + "grad_norm": 0.9166827201843262, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 3.63671552, + "gpu_mem": 1.068913152, + "loss": 1.3684, + "grad_norm": 0.487032413482666, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 3.637108736, + "gpu_mem": 1.0689408, + "loss": 1.4051, + "grad_norm": 1.4723758697509766, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 3.63769856, + "gpu_mem": 1.068916224, + "loss": 1.4488, + "grad_norm": 2.0456278324127197, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 3.638091776, + "gpu_mem": 1.068934656, + "loss": 1.3875, + "grad_norm": 1.1456187963485718, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 3.6386816, + "gpu_mem": 1.068928512, + "loss": 1.3906, + "grad_norm": 1.0054882764816284, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 3.639074816, + "gpu_mem": 1.06889472, + "loss": 1.4019, + "grad_norm": 1.149444341659546, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 3.639468032, + "gpu_mem": 1.068923904, + "loss": 1.4183, + "grad_norm": 1.3061683177947998, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 3.640057856, + "gpu_mem": 1.068907008, + "loss": 1.32, + "grad_norm": 0.7658959031105042, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 3.640451072, + "gpu_mem": 1.06894848, + "loss": 1.4117, + "grad_norm": 1.7494112253189087, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 3.641040896, + "gpu_mem": 1.068914688, + "loss": 1.4214, + "grad_norm": 0.8701615929603577, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 3.641434112, + "gpu_mem": 1.068954624, + "loss": 1.3574, + "grad_norm": 1.3487862348556519, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 3.641827328, + "gpu_mem": 1.068908544, + "loss": 1.4556, + "grad_norm": 1.0214182138442993, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 3.642220544, + "gpu_mem": 1.068913152, + "loss": 1.4521, + "grad_norm": 1.1015413999557495, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 3.642810368, + "gpu_mem": 1.06891008, + "loss": 1.4476, + "grad_norm": 1.1731510162353516, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 3.643203584, + "gpu_mem": 1.068928512, + "loss": 1.4347, + "grad_norm": 0.7913596630096436, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 3.643793408, + "gpu_mem": 1.068920832, + "loss": 1.3902, + "grad_norm": 0.34518906474113464, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 3.644383232, + "gpu_mem": 1.068905472, + "loss": 1.4517, + "grad_norm": 1.493249773979187, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 3.644776448, + "gpu_mem": 1.068976128, + "loss": 1.4174, + "grad_norm": 1.0877691507339478, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 3.645169664, + "gpu_mem": 1.068926976, + "loss": 1.4161, + "grad_norm": 1.1136096715927124, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 3.64556288, + "gpu_mem": 1.068951552, + "loss": 1.3872, + "grad_norm": 1.0074044466018677, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 3.645956096, + "gpu_mem": 1.068922368, + "loss": 1.4075, + "grad_norm": 0.8339865803718567, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 3.646349312, + "gpu_mem": 1.068914688, + "loss": 1.4056, + "grad_norm": 1.1939915418624878, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 3.646939136, + "gpu_mem": 1.068908544, + "loss": 1.3825, + "grad_norm": 0.4442974627017975, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 3.647332352, + "gpu_mem": 1.068937728, + "loss": 1.4247, + "grad_norm": 1.1121824979782104, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 3.64752896, + "gpu_mem": 1.068928512, + "loss": 1.442, + "grad_norm": 1.6552414894104004, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 3.647922176, + "gpu_mem": 1.068916224, + "loss": 1.4005, + "grad_norm": 0.5918968319892883, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 3.648315392, + "gpu_mem": 1.068908544, + "loss": 1.4024, + "grad_norm": 0.9621551632881165, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 3.648708608, + "gpu_mem": 1.068960768, + "loss": 1.392, + "grad_norm": 0.7375335097312927, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 3.649101824, + "gpu_mem": 1.068939264, + "loss": 1.4082, + "grad_norm": 0.6020700931549072, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 3.64949504, + "gpu_mem": 1.06893312, + "loss": 1.3552, + "grad_norm": 0.4545043706893921, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 3.650084864, + "gpu_mem": 1.06891008, + "loss": 1.3875, + "grad_norm": 0.8592332005500793, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 3.65047808, + "gpu_mem": 1.068931584, + "loss": 1.4453, + "grad_norm": 1.509513020515442, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 3.650871296, + "gpu_mem": 1.068903936, + "loss": 1.4679, + "grad_norm": 1.5138100385665894, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 3.651067904, + "gpu_mem": 1.068911616, + "loss": 1.4023, + "grad_norm": 0.8283829092979431, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 3.65146112, + "gpu_mem": 1.068930048, + "loss": 1.3772, + "grad_norm": 0.6338778138160706, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 3.652050944, + "gpu_mem": 1.068919296, + "loss": 1.4026, + "grad_norm": 0.6547983288764954, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 3.652247552, + "gpu_mem": 1.06891776, + "loss": 1.3887, + "grad_norm": 0.7275934815406799, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 3.652640768, + "gpu_mem": 1.068913152, + "loss": 1.4238, + "grad_norm": 1.0901581048965454, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 3.653033984, + "gpu_mem": 1.06891776, + "loss": 1.3937, + "grad_norm": 0.6593455076217651, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 3.6534272, + "gpu_mem": 1.068928512, + "loss": 1.391, + "grad_norm": 0.6273136734962463, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 3.653820416, + "gpu_mem": 1.068931584, + "loss": 1.4274, + "grad_norm": 0.8155838847160339, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 3.654213632, + "gpu_mem": 1.068931584, + "loss": 1.4036, + "grad_norm": 0.3780589997768402, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 3.654606848, + "gpu_mem": 1.068926976, + "loss": 1.4067, + "grad_norm": 0.6934848427772522, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 3.655000064, + "gpu_mem": 1.068945408, + "loss": 1.3734, + "grad_norm": 0.5355036854743958, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 3.65539328, + "gpu_mem": 1.06894848, + "loss": 1.3808, + "grad_norm": 0.25487735867500305, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 3.655786496, + "gpu_mem": 1.06892544, + "loss": 1.4126, + "grad_norm": 0.5576616525650024, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 3.656179712, + "gpu_mem": 1.068936192, + "loss": 1.4213, + "grad_norm": 0.7194792032241821, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 3.656572928, + "gpu_mem": 1.068936192, + "loss": 1.377, + "grad_norm": 0.7278963923454285, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 3.656966144, + "gpu_mem": 1.068911616, + "loss": 1.3911, + "grad_norm": 0.4599713981151581, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 3.65735936, + "gpu_mem": 1.0689408, + "loss": 1.4013, + "grad_norm": 0.9241965413093567, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 3.657555968, + "gpu_mem": 1.06891776, + "loss": 1.3509, + "grad_norm": 0.5053690671920776, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 3.657949184, + "gpu_mem": 1.068934656, + "loss": 1.4243, + "grad_norm": 0.7901169061660767, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 3.658145792, + "gpu_mem": 1.0689024, + "loss": 1.4212, + "grad_norm": 1.0145132541656494, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 3.658539008, + "gpu_mem": 1.06891776, + "loss": 1.375, + "grad_norm": 0.5602287650108337, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 3.658735616, + "gpu_mem": 1.068897792, + "loss": 1.4191, + "grad_norm": 1.0905739068984985, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 3.659128832, + "gpu_mem": 1.068939264, + "loss": 1.3938, + "grad_norm": 0.5754500031471252, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 3.659522048, + "gpu_mem": 1.068934656, + "loss": 1.3914, + "grad_norm": 0.6425297260284424, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 3.659915264, + "gpu_mem": 1.0689408, + "loss": 1.3875, + "grad_norm": 0.6998670697212219, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 3.66030848, + "gpu_mem": 1.068937728, + "loss": 1.381, + "grad_norm": 0.35458239912986755, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 3.660505088, + "gpu_mem": 1.068939264, + "loss": 1.3753, + "grad_norm": 0.6327486038208008, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 3.660898304, + "gpu_mem": 1.068936192, + "loss": 1.3841, + "grad_norm": 0.40174758434295654, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 3.661094912, + "gpu_mem": 1.068916224, + "loss": 1.3995, + "grad_norm": 0.5586431622505188, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 3.661488128, + "gpu_mem": 1.068911616, + "loss": 1.3484, + "grad_norm": 0.4549715518951416, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 3.661881344, + "gpu_mem": 1.068930048, + "loss": 1.4193, + "grad_norm": 0.7280191779136658, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 3.66227456, + "gpu_mem": 1.0689408, + "loss": 1.4056, + "grad_norm": 0.553000807762146, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 3.662471168, + "gpu_mem": 1.068926976, + "loss": 1.387, + "grad_norm": 0.4424668550491333, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 3.662864384, + "gpu_mem": 1.068942336, + "loss": 1.4243, + "grad_norm": 0.9642329812049866, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 3.6632576, + "gpu_mem": 1.068923904, + "loss": 1.363, + "grad_norm": 0.7512794733047485, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 3.663650816, + "gpu_mem": 1.068950016, + "loss": 1.3944, + "grad_norm": 0.7992354035377502, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 3.663847424, + "gpu_mem": 1.068908544, + "loss": 1.3785, + "grad_norm": 0.4586975574493408, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 3.66424064, + "gpu_mem": 1.0689408, + "loss": 1.4058, + "grad_norm": 0.5738359093666077, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 3.664633856, + "gpu_mem": 1.068934656, + "loss": 1.4023, + "grad_norm": 0.40117764472961426, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 3.664830464, + "gpu_mem": 1.068936192, + "loss": 1.3815, + "grad_norm": 0.5213998556137085, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 3.66522368, + "gpu_mem": 1.068911616, + "loss": 1.3972, + "grad_norm": 0.7175266146659851, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 3.665420288, + "gpu_mem": 1.068920832, + "loss": 1.3735, + "grad_norm": 0.6172037124633789, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 3.665616896, + "gpu_mem": 1.068907008, + "loss": 1.4233, + "grad_norm": 0.7675486207008362, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 3.666010112, + "gpu_mem": 1.068943872, + "loss": 1.3698, + "grad_norm": 0.40757447481155396, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 3.666403328, + "gpu_mem": 1.0689408, + "loss": 1.3866, + "grad_norm": 0.26603972911834717, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 3.666599936, + "gpu_mem": 1.0689408, + "loss": 1.4145, + "grad_norm": 0.623124361038208, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 3.666993152, + "gpu_mem": 1.068930048, + "loss": 1.3987, + "grad_norm": 0.5772870182991028, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 3.667386368, + "gpu_mem": 1.068930048, + "loss": 1.3612, + "grad_norm": 0.4209688901901245, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 3.667582976, + "gpu_mem": 1.068911616, + "loss": 1.3921, + "grad_norm": 0.6344068050384521, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 3.667976192, + "gpu_mem": 1.068922368, + "loss": 1.4048, + "grad_norm": 0.6471256613731384, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 3.6681728, + "gpu_mem": 1.068931584, + "loss": 1.3801, + "grad_norm": 0.46671023964881897, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 3.668566016, + "gpu_mem": 1.068946944, + "loss": 1.4327, + "grad_norm": 0.851349949836731, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 3.668959232, + "gpu_mem": 1.06889472, + "loss": 1.3994, + "grad_norm": 0.5963117480278015, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 3.66915584, + "gpu_mem": 1.068914688, + "loss": 1.3925, + "grad_norm": 0.37339600920677185, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 3.669352448, + "gpu_mem": 1.068896256, + "loss": 1.3894, + "grad_norm": 0.42718419432640076, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 3.669549056, + "gpu_mem": 1.068913152, + "loss": 1.3959, + "grad_norm": 0.8113850951194763, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 3.669942272, + "gpu_mem": 1.068919296, + "loss": 1.4062, + "grad_norm": 0.6125122308731079, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 3.67013888, + "gpu_mem": 1.068916224, + "loss": 1.3938, + "grad_norm": 0.3761835992336273, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 3.670532096, + "gpu_mem": 1.068942336, + "loss": 1.3807, + "grad_norm": 0.3114449977874756, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 3.670728704, + "gpu_mem": 1.068916224, + "loss": 1.3978, + "grad_norm": 0.3510706126689911, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 3.67112192, + "gpu_mem": 1.06895616, + "loss": 1.3823, + "grad_norm": 0.7981694340705872, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 3.671318528, + "gpu_mem": 1.068905472, + "loss": 1.3961, + "grad_norm": 0.6980183124542236, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 3.671515136, + "gpu_mem": 1.068914688, + "loss": 1.3976, + "grad_norm": 0.7641631364822388, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 3.671711744, + "gpu_mem": 1.068934656, + "loss": 1.3648, + "grad_norm": 0.4158077836036682, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 3.67210496, + "gpu_mem": 1.06892544, + "loss": 1.3781, + "grad_norm": 0.5091270208358765, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 3.672301568, + "gpu_mem": 1.068937728, + "loss": 1.3883, + "grad_norm": 0.7599169611930847, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 3.672694784, + "gpu_mem": 1.0689024, + "loss": 1.4264, + "grad_norm": 0.961350679397583, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 3.673088, + "gpu_mem": 1.06893312, + "loss": 1.4054, + "grad_norm": 0.9087377190589905, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 3.673284608, + "gpu_mem": 1.068928512, + "loss": 1.376, + "grad_norm": 0.6747002601623535, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 3.673481216, + "gpu_mem": 1.068953088, + "loss": 1.3739, + "grad_norm": 0.4286094903945923, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 3.673677824, + "gpu_mem": 1.068890112, + "loss": 1.4371, + "grad_norm": 0.8431199789047241, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 3.67407104, + "gpu_mem": 1.068943872, + "loss": 1.3784, + "grad_norm": 0.4023621678352356, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 3.674267648, + "gpu_mem": 1.068936192, + "loss": 1.3833, + "grad_norm": 0.32896727323532104, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 3.674464256, + "gpu_mem": 1.068951552, + "loss": 1.4251, + "grad_norm": 0.673317015171051, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 3.674660864, + "gpu_mem": 1.068953088, + "loss": 1.4051, + "grad_norm": 0.4557768404483795, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 3.674857472, + "gpu_mem": 1.068920832, + "loss": 1.4418, + "grad_norm": 0.7233399152755737, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 3.675250688, + "gpu_mem": 1.068913152, + "loss": 1.404, + "grad_norm": 0.4967329800128937, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 3.675643904, + "gpu_mem": 1.06894848, + "loss": 1.3954, + "grad_norm": 0.27028393745422363, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 3.675840512, + "gpu_mem": 1.068896256, + "loss": 1.3935, + "grad_norm": 0.480778306722641, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 3.67603712, + "gpu_mem": 1.068931584, + "loss": 1.4011, + "grad_norm": 0.5831640362739563, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 3.676233728, + "gpu_mem": 1.068920832, + "loss": 1.3837, + "grad_norm": 0.22451813519001007, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 3.676430336, + "gpu_mem": 1.068953088, + "loss": 1.3816, + "grad_norm": 0.317891389131546, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 3.676626944, + "gpu_mem": 1.068946944, + "loss": 1.3848, + "grad_norm": 0.7181174755096436, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 3.676823552, + "gpu_mem": 1.06892544, + "loss": 1.3879, + "grad_norm": 0.28904303908348083, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 3.67702016, + "gpu_mem": 1.068922368, + "loss": 1.3997, + "grad_norm": 0.5036405324935913, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 3.677413376, + "gpu_mem": 1.068959232, + "loss": 1.4049, + "grad_norm": 0.49359646439552307, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 3.677609984, + "gpu_mem": 1.06893312, + "loss": 1.3747, + "grad_norm": 0.3612525761127472, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 3.677806592, + "gpu_mem": 1.06891008, + "loss": 1.3921, + "grad_norm": 0.42489948868751526, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 3.6780032, + "gpu_mem": 1.068936192, + "loss": 1.4152, + "grad_norm": 0.7515605688095093, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 3.678396416, + "gpu_mem": 1.06894848, + "loss": 1.4185, + "grad_norm": 0.9692003130912781, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 3.678593024, + "gpu_mem": 1.06891008, + "loss": 1.3881, + "grad_norm": 0.39034876227378845, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 3.678789632, + "gpu_mem": 1.068919296, + "loss": 1.4069, + "grad_norm": 0.4588618576526642, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 3.67898624, + "gpu_mem": 1.068900864, + "loss": 1.3965, + "grad_norm": 0.5333432555198669, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 3.679379456, + "gpu_mem": 1.068914688, + "loss": 1.4238, + "grad_norm": 0.9636707305908203, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 3.679576064, + "gpu_mem": 1.068922368, + "loss": 1.403, + "grad_norm": 0.6377411484718323, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 3.679772672, + "gpu_mem": 1.068903936, + "loss": 1.3956, + "grad_norm": 0.5933225750923157, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 3.67996928, + "gpu_mem": 1.068934656, + "loss": 1.387, + "grad_norm": 0.6096765398979187, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 3.680165888, + "gpu_mem": 1.068905472, + "loss": 1.406, + "grad_norm": 0.8229501843452454, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 3.680362496, + "gpu_mem": 1.068930048, + "loss": 1.392, + "grad_norm": 0.5232793092727661, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 3.680559104, + "gpu_mem": 1.06891008, + "loss": 1.4361, + "grad_norm": 1.002776026725769, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 3.680755712, + "gpu_mem": 1.068942336, + "loss": 1.4522, + "grad_norm": 1.2525769472122192, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 3.68095232, + "gpu_mem": 1.06893312, + "loss": 1.3976, + "grad_norm": 0.5960250496864319, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 3.681148928, + "gpu_mem": 1.068928512, + "loss": 1.3979, + "grad_norm": 0.9944146275520325, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 3.681345536, + "gpu_mem": 1.068885504, + "loss": 1.3794, + "grad_norm": 0.38420671224594116, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 3.681542144, + "gpu_mem": 1.068965376, + "loss": 1.3989, + "grad_norm": 0.5632681846618652, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 3.681738752, + "gpu_mem": 1.068916224, + "loss": 1.3932, + "grad_norm": 0.7512533664703369, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 3.68193536, + "gpu_mem": 1.068916224, + "loss": 1.3912, + "grad_norm": 0.5973755121231079, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 3.682328576, + "gpu_mem": 1.068882432, + "loss": 1.404, + "grad_norm": 0.6814656257629395, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 3.682525184, + "gpu_mem": 1.068922368, + "loss": 1.3439, + "grad_norm": 0.42433035373687744, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 3.6829184, + "gpu_mem": 1.06891776, + "loss": 1.3931, + "grad_norm": 0.886184573173523, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 3.683115008, + "gpu_mem": 1.068905472, + "loss": 1.3862, + "grad_norm": 0.4638538360595703, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 3.683311616, + "gpu_mem": 1.068930048, + "loss": 1.4067, + "grad_norm": 1.0693466663360596, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 3.683508224, + "gpu_mem": 1.068945408, + "loss": 1.3823, + "grad_norm": 0.806023359298706, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 3.683704832, + "gpu_mem": 1.06891776, + "loss": 1.3685, + "grad_norm": 0.4403788447380066, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 3.68390144, + "gpu_mem": 1.068914688, + "loss": 1.406, + "grad_norm": 0.7057942748069763, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 3.684098048, + "gpu_mem": 1.068930048, + "loss": 1.432, + "grad_norm": 0.7056812047958374, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 3.684294656, + "gpu_mem": 1.068907008, + "loss": 1.3901, + "grad_norm": 0.2778489887714386, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 3.684294656, + "gpu_mem": 1.068908544, + "loss": 1.4082, + "grad_norm": 0.431721955537796, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 3.684491264, + "gpu_mem": 1.068950016, + "loss": 1.4094, + "grad_norm": 0.6701951026916504, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 3.68488448, + "gpu_mem": 1.068920832, + "loss": 1.3854, + "grad_norm": 0.36968040466308594, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 3.685081088, + "gpu_mem": 1.068920832, + "loss": 1.3755, + "grad_norm": 0.42050042748451233, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 3.685277696, + "gpu_mem": 1.06891776, + "loss": 1.3732, + "grad_norm": 0.6240767240524292, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 3.685474304, + "gpu_mem": 1.06891776, + "loss": 1.3994, + "grad_norm": 0.6604028940200806, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 3.685670912, + "gpu_mem": 1.068908544, + "loss": 1.3933, + "grad_norm": 0.7134949564933777, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 3.68586752, + "gpu_mem": 1.068943872, + "loss": 1.388, + "grad_norm": 0.9533185362815857, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 3.686064128, + "gpu_mem": 1.068900864, + "loss": 1.3676, + "grad_norm": 0.3691646158695221, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 3.686260736, + "gpu_mem": 1.068928512, + "loss": 1.3932, + "grad_norm": 0.560864269733429, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 3.686260736, + "gpu_mem": 1.068937728, + "loss": 1.3845, + "grad_norm": 1.0631719827651978, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 3.686457344, + "gpu_mem": 1.06891008, + "loss": 1.3628, + "grad_norm": 0.9732236266136169, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 3.686653952, + "gpu_mem": 1.068919296, + "loss": 1.3733, + "grad_norm": 0.7067987322807312, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 3.686653952, + "gpu_mem": 1.068920832, + "loss": 1.4469, + "grad_norm": 1.1544378995895386, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 3.687047168, + "gpu_mem": 1.068920832, + "loss": 1.3917, + "grad_norm": 0.754188597202301, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 3.687243776, + "gpu_mem": 1.068905472, + "loss": 1.4345, + "grad_norm": 1.1262266635894775, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 3.687636992, + "gpu_mem": 1.068926976, + "loss": 1.3979, + "grad_norm": 0.7733461260795593, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 3.6878336, + "gpu_mem": 1.068960768, + "loss": 1.4021, + "grad_norm": 0.7447898387908936, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 3.688030208, + "gpu_mem": 1.068914688, + "loss": 1.3707, + "grad_norm": 0.5296040177345276, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 3.688226816, + "gpu_mem": 1.068920832, + "loss": 1.3724, + "grad_norm": 0.470849871635437, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 3.688226816, + "gpu_mem": 1.068936192, + "loss": 1.4018, + "grad_norm": 0.6405095458030701, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 3.688423424, + "gpu_mem": 1.068954624, + "loss": 1.3883, + "grad_norm": 0.5859748721122742, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 3.688423424, + "gpu_mem": 1.068923904, + "loss": 1.3654, + "grad_norm": 0.31066417694091797, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 3.688620032, + "gpu_mem": 1.06891008, + "loss": 1.3872, + "grad_norm": 0.4624931812286377, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 3.68881664, + "gpu_mem": 1.0689024, + "loss": 1.3874, + "grad_norm": 0.3328680694103241, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 3.689013248, + "gpu_mem": 1.068966912, + "loss": 1.387, + "grad_norm": 0.4248436689376831, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 3.689209856, + "gpu_mem": 1.068905472, + "loss": 1.3494, + "grad_norm": 0.4819003641605377, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 3.689406464, + "gpu_mem": 1.068957696, + "loss": 1.3806, + "grad_norm": 0.3653075098991394, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 3.689603072, + "gpu_mem": 1.068939264, + "loss": 1.3765, + "grad_norm": 0.38901910185813904, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 3.68979968, + "gpu_mem": 1.068937728, + "loss": 1.3964, + "grad_norm": 0.6662193536758423, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 3.690192896, + "gpu_mem": 1.068942336, + "loss": 1.3771, + "grad_norm": 0.4676947295665741, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 3.690192896, + "gpu_mem": 1.06891776, + "loss": 1.3645, + "grad_norm": 1.0399150848388672, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 3.690389504, + "gpu_mem": 1.068946944, + "loss": 1.3693, + "grad_norm": 0.6490265130996704, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 3.690389504, + "gpu_mem": 1.068923904, + "loss": 1.3469, + "grad_norm": 0.7418602108955383, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 3.690586112, + "gpu_mem": 1.068985344, + "loss": 1.4312, + "grad_norm": 1.4843430519104004, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 3.69078272, + "gpu_mem": 1.06891008, + "loss": 1.4352, + "grad_norm": 1.6809767484664917, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 3.690979328, + "gpu_mem": 1.068920832, + "loss": 1.3992, + "grad_norm": 1.2306311130523682, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 3.691175936, + "gpu_mem": 1.068919296, + "loss": 1.3991, + "grad_norm": 0.7124042510986328, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 3.691372544, + "gpu_mem": 1.068916224, + "loss": 1.3395, + "grad_norm": 0.6723741888999939, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 3.691569152, + "gpu_mem": 1.068946944, + "loss": 1.3487, + "grad_norm": 0.7925060987472534, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 3.691569152, + "gpu_mem": 1.06892544, + "loss": 1.3544, + "grad_norm": 0.6993321180343628, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 3.69176576, + "gpu_mem": 1.068920832, + "loss": 1.314, + "grad_norm": 0.7342727184295654, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 3.691962368, + "gpu_mem": 1.068931584, + "loss": 1.3714, + "grad_norm": 1.0872082710266113, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 3.692158976, + "gpu_mem": 1.068936192, + "loss": 1.347, + "grad_norm": 0.5439728498458862, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 3.692355584, + "gpu_mem": 1.068897792, + "loss": 1.352, + "grad_norm": 0.5284808874130249, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 3.692552192, + "gpu_mem": 1.068965376, + "loss": 1.3331, + "grad_norm": 0.6979990005493164, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 3.6927488, + "gpu_mem": 1.068928512, + "loss": 1.2749, + "grad_norm": 0.8431934714317322, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 3.6927488, + "gpu_mem": 1.06891776, + "loss": 1.3258, + "grad_norm": 0.9904539585113525, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 3.692945408, + "gpu_mem": 1.068934656, + "loss": 1.3319, + "grad_norm": 1.3245640993118286, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 3.693142016, + "gpu_mem": 1.068908544, + "loss": 1.3553, + "grad_norm": 2.3166708946228027, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 3.693338624, + "gpu_mem": 1.06895616, + "loss": 1.3, + "grad_norm": 1.9807707071304321, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 3.693535232, + "gpu_mem": 1.068923904, + "loss": 1.2769, + "grad_norm": 2.745403528213501, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 3.693535232, + "gpu_mem": 1.068913152, + "loss": 1.2319, + "grad_norm": 1.5667729377746582, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 3.69373184, + "gpu_mem": 1.068928512, + "loss": 1.2342, + "grad_norm": 1.3794996738433838, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 3.69373184, + "gpu_mem": 1.06892544, + "loss": 1.1938, + "grad_norm": 1.3089722394943237, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 3.693928448, + "gpu_mem": 1.06892544, + "loss": 1.2204, + "grad_norm": 2.402496814727783, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 3.694125056, + "gpu_mem": 1.068913152, + "loss": 1.3166, + "grad_norm": 2.732975959777832, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 3.694321664, + "gpu_mem": 1.068896256, + "loss": 1.193, + "grad_norm": 2.404637336730957, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 3.694518272, + "gpu_mem": 1.068959232, + "loss": 1.2287, + "grad_norm": 2.5566787719726562, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 3.694518272, + "gpu_mem": 1.068913152, + "loss": 1.3176, + "grad_norm": 4.991761207580566, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 3.69471488, + "gpu_mem": 1.068922368, + "loss": 1.19, + "grad_norm": 3.712761878967285, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 3.694911488, + "gpu_mem": 1.068957696, + "loss": 0.9667, + "grad_norm": 3.3610150814056396, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 3.695108096, + "gpu_mem": 1.068922368, + "loss": 1.0759, + "grad_norm": 3.132323980331421, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 3.695108096, + "gpu_mem": 1.068926976, + "loss": 1.3511, + "grad_norm": 5.3820929527282715, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 3.695304704, + "gpu_mem": 1.068974592, + "loss": 1.1029, + "grad_norm": 4.647565841674805, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 3.695304704, + "gpu_mem": 1.068983808, + "loss": 1.1408, + "grad_norm": 3.025301218032837, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 3.695501312, + "gpu_mem": 1.068937728, + "loss": 1.3128, + "grad_norm": 4.65925407409668, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 3.69569792, + "gpu_mem": 1.068931584, + "loss": 1.1448, + "grad_norm": 5.294458389282227, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 3.695894528, + "gpu_mem": 1.068993024, + "loss": 1.1863, + "grad_norm": 3.558245897293091, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 3.695894528, + "gpu_mem": 1.068919296, + "loss": 1.0877, + "grad_norm": 1.9053583145141602, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 3.696091136, + "gpu_mem": 1.06891776, + "loss": 1.1503, + "grad_norm": 2.6998753547668457, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 3.696287744, + "gpu_mem": 1.068920832, + "loss": 1.1162, + "grad_norm": 2.830162525177002, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 3.696484352, + "gpu_mem": 1.068907008, + "loss": 1.0042, + "grad_norm": 2.7182862758636475, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 3.696484352, + "gpu_mem": 1.068922368, + "loss": 1.0412, + "grad_norm": 2.3739230632781982, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 3.69668096, + "gpu_mem": 1.068960768, + "loss": 1.0588, + "grad_norm": 2.265842914581299, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 3.696877568, + "gpu_mem": 1.0689408, + "loss": 1.1147, + "grad_norm": 3.8851513862609863, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 3.696877568, + "gpu_mem": 1.068966912, + "loss": 1.0823, + "grad_norm": 2.450185775756836, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 3.697074176, + "gpu_mem": 1.06891776, + "loss": 0.9792, + "grad_norm": 2.9332261085510254, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 3.697270784, + "gpu_mem": 1.068911616, + "loss": 1.0576, + "grad_norm": 3.6962268352508545, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 3.697270784, + "gpu_mem": 1.068934656, + "loss": 1.0071, + "grad_norm": 3.428941249847412, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 3.697467392, + "gpu_mem": 1.068913152, + "loss": 0.9923, + "grad_norm": 4.1819071769714355, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 3.697467392, + "gpu_mem": 1.068926976, + "loss": 1.1644, + "grad_norm": 4.040997505187988, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 3.697664, + "gpu_mem": 1.068931584, + "loss": 0.9334, + "grad_norm": 4.6385884284973145, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 3.697664, + "gpu_mem": 1.068950016, + "loss": 0.9878, + "grad_norm": 4.289714813232422, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 3.697860608, + "gpu_mem": 1.068920832, + "loss": 0.8499, + "grad_norm": 4.5127153396606445, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 3.697860608, + "gpu_mem": 1.06894848, + "loss": 1.1073, + "grad_norm": 4.822424411773682, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 3.698057216, + "gpu_mem": 1.068930048, + "loss": 0.9929, + "grad_norm": 3.9841525554656982, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 3.698057216, + "gpu_mem": 1.06891776, + "loss": 1.0487, + "grad_norm": 6.2619428634643555, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 3.698253824, + "gpu_mem": 1.068926976, + "loss": 1.1391, + "grad_norm": 6.163171291351318, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 3.627278336, + "gpu_mem": 1.068923904, + "loss": 1.0397, + "grad_norm": 4.1983747482299805, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 3.628064768, + "gpu_mem": 1.068939264, + "loss": 0.7898, + "grad_norm": 3.4024879932403564, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 3.6288512, + "gpu_mem": 1.068946944, + "loss": 0.8792, + "grad_norm": 3.8177716732025146, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 3.629637632, + "gpu_mem": 1.068936192, + "loss": 0.9729, + "grad_norm": 3.7735037803649902, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 3.630424064, + "gpu_mem": 1.068920832, + "loss": 1.0974, + "grad_norm": 4.682862281799316, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 3.631210496, + "gpu_mem": 1.068923904, + "loss": 0.9242, + "grad_norm": 4.400240898132324, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 3.631996928, + "gpu_mem": 1.06891776, + "loss": 0.983, + "grad_norm": 4.009866237640381, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 3.632586752, + "gpu_mem": 1.068913152, + "loss": 1.0707, + "grad_norm": 5.258261203765869, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 3.633176576, + "gpu_mem": 1.068934656, + "loss": 0.8267, + "grad_norm": 3.869562864303589, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 3.6337664, + "gpu_mem": 1.068926976, + "loss": 0.8856, + "grad_norm": 2.821077346801758, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 3.634356224, + "gpu_mem": 1.068899328, + "loss": 1.181, + "grad_norm": 6.042818546295166, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 3.634946048, + "gpu_mem": 1.068897792, + "loss": 0.8988, + "grad_norm": 2.859086751937866, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 3.635535872, + "gpu_mem": 1.068923904, + "loss": 0.9627, + "grad_norm": 6.569732189178467, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 3.636125696, + "gpu_mem": 1.068907008, + "loss": 0.7715, + "grad_norm": 3.856107711791992, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 3.63671552, + "gpu_mem": 1.068937728, + "loss": 0.8673, + "grad_norm": 6.9080095291137695, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 3.637305344, + "gpu_mem": 1.068920832, + "loss": 0.907, + "grad_norm": 5.620052337646484, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 3.637895168, + "gpu_mem": 1.068951552, + "loss": 0.886, + "grad_norm": 4.264993190765381, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 3.638484992, + "gpu_mem": 1.068919296, + "loss": 0.8687, + "grad_norm": 4.096889495849609, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 3.639074816, + "gpu_mem": 1.068945408, + "loss": 0.7881, + "grad_norm": 5.0207295417785645, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 3.639468032, + "gpu_mem": 1.068920832, + "loss": 0.8773, + "grad_norm": 7.086152076721191, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 3.640057856, + "gpu_mem": 1.068916224, + "loss": 0.8521, + "grad_norm": 4.409608364105225, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 3.640451072, + "gpu_mem": 1.068919296, + "loss": 0.8387, + "grad_norm": 4.034334182739258, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 3.641040896, + "gpu_mem": 1.068937728, + "loss": 0.7109, + "grad_norm": 3.930853843688965, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 3.64163072, + "gpu_mem": 1.06891776, + "loss": 0.8167, + "grad_norm": 3.172741651535034, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 3.642023936, + "gpu_mem": 1.068922368, + "loss": 0.8581, + "grad_norm": 4.152272701263428, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 3.64261376, + "gpu_mem": 1.06891776, + "loss": 0.8972, + "grad_norm": 5.2409348487854, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 3.643006976, + "gpu_mem": 1.06892544, + "loss": 0.584, + "grad_norm": 4.1399383544921875, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 3.643400192, + "gpu_mem": 1.068950016, + "loss": 0.9496, + "grad_norm": 9.175780296325684, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 3.643793408, + "gpu_mem": 1.068942336, + "loss": 0.8584, + "grad_norm": 5.816468715667725, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 3.644383232, + "gpu_mem": 1.068943872, + "loss": 0.9008, + "grad_norm": 6.626401424407959, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 3.644776448, + "gpu_mem": 1.068919296, + "loss": 0.6698, + "grad_norm": 4.236902713775635, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 3.645169664, + "gpu_mem": 1.068920832, + "loss": 1.0648, + "grad_norm": 6.311054706573486, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 3.64556288, + "gpu_mem": 1.0689408, + "loss": 0.9143, + "grad_norm": 4.998040199279785, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 3.646152704, + "gpu_mem": 1.068913152, + "loss": 0.9507, + "grad_norm": 6.0702595710754395, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 3.64654592, + "gpu_mem": 1.06892544, + "loss": 0.6631, + "grad_norm": 4.230674743652344, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 3.646939136, + "gpu_mem": 1.068934656, + "loss": 0.7695, + "grad_norm": 4.2649383544921875, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 3.647332352, + "gpu_mem": 1.068911616, + "loss": 0.7407, + "grad_norm": 3.3310205936431885, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 3.647725568, + "gpu_mem": 1.068936192, + "loss": 0.9543, + "grad_norm": 6.778459072113037, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 3.648118784, + "gpu_mem": 1.068936192, + "loss": 0.6481, + "grad_norm": 3.740067958831787, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 3.648512, + "gpu_mem": 1.068919296, + "loss": 0.8445, + "grad_norm": 6.251247406005859, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 3.648905216, + "gpu_mem": 1.068916224, + "loss": 0.6453, + "grad_norm": 4.003299236297607, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 3.649298432, + "gpu_mem": 1.068908544, + "loss": 0.7712, + "grad_norm": 5.071538925170898, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 3.649691648, + "gpu_mem": 1.068946944, + "loss": 0.7863, + "grad_norm": 5.853189945220947, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 3.650084864, + "gpu_mem": 1.068923904, + "loss": 1.0234, + "grad_norm": 6.13104772567749, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 3.65047808, + "gpu_mem": 1.068922368, + "loss": 0.8941, + "grad_norm": 4.807613849639893, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 3.650871296, + "gpu_mem": 1.068939264, + "loss": 0.7332, + "grad_norm": 4.303494453430176, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 3.651264512, + "gpu_mem": 1.068923904, + "loss": 0.7166, + "grad_norm": 6.109379291534424, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 3.651657728, + "gpu_mem": 1.068936192, + "loss": 0.7991, + "grad_norm": 5.545070171356201, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 3.652050944, + "gpu_mem": 1.06894848, + "loss": 0.6189, + "grad_norm": 4.7248311042785645, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 3.65244416, + "gpu_mem": 1.068923904, + "loss": 0.8112, + "grad_norm": 4.536865234375, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 3.652837376, + "gpu_mem": 1.068968448, + "loss": 0.8855, + "grad_norm": 5.280837535858154, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 3.653230592, + "gpu_mem": 1.068942336, + "loss": 0.751, + "grad_norm": 4.931146144866943, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 3.653623808, + "gpu_mem": 1.068939264, + "loss": 0.7955, + "grad_norm": 6.167722702026367, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 3.654017024, + "gpu_mem": 1.068920832, + "loss": 0.7443, + "grad_norm": 5.990055561065674, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 3.65441024, + "gpu_mem": 1.068926976, + "loss": 0.594, + "grad_norm": 3.5547595024108887, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 3.654803456, + "gpu_mem": 1.068896256, + "loss": 0.7364, + "grad_norm": 4.6511359214782715, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 3.655196672, + "gpu_mem": 1.068960768, + "loss": 0.7884, + "grad_norm": 3.388368844985962, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 3.65539328, + "gpu_mem": 1.068914688, + "loss": 0.7019, + "grad_norm": 5.678548336029053, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 3.655786496, + "gpu_mem": 1.068908544, + "loss": 0.7943, + "grad_norm": 3.7414724826812744, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 3.656179712, + "gpu_mem": 1.06896384, + "loss": 0.4515, + "grad_norm": 3.413428544998169, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 3.656572928, + "gpu_mem": 1.068930048, + "loss": 0.6822, + "grad_norm": 5.245158672332764, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 3.656966144, + "gpu_mem": 1.06891776, + "loss": 0.619, + "grad_norm": 4.785041809082031, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 3.65735936, + "gpu_mem": 1.068922368, + "loss": 0.7296, + "grad_norm": 4.404187202453613, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 3.657752576, + "gpu_mem": 1.0689024, + "loss": 0.7032, + "grad_norm": 4.180550575256348, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 3.657949184, + "gpu_mem": 1.068926976, + "loss": 0.7405, + "grad_norm": 4.910958766937256, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 3.6583424, + "gpu_mem": 1.068905472, + "loss": 0.512, + "grad_norm": 4.361656188964844, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 3.658735616, + "gpu_mem": 1.068922368, + "loss": 0.552, + "grad_norm": 4.110744476318359, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 3.659128832, + "gpu_mem": 1.06888704, + "loss": 1.0341, + "grad_norm": 6.688344955444336, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 3.65932544, + "gpu_mem": 1.068919296, + "loss": 0.6135, + "grad_norm": 5.444723129272461, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 3.659522048, + "gpu_mem": 1.068908544, + "loss": 0.5615, + "grad_norm": 4.0923075675964355, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 3.659915264, + "gpu_mem": 1.068945408, + "loss": 0.8284, + "grad_norm": 5.875912666320801, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 3.66030848, + "gpu_mem": 1.068911616, + "loss": 0.8588, + "grad_norm": 6.305935382843018, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 3.660701696, + "gpu_mem": 1.068934656, + "loss": 0.8492, + "grad_norm": 5.849969863891602, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 3.660898304, + "gpu_mem": 1.068923904, + "loss": 0.6204, + "grad_norm": 3.349527597427368, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 3.66129152, + "gpu_mem": 1.068930048, + "loss": 0.7296, + "grad_norm": 4.776098728179932, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 3.661488128, + "gpu_mem": 1.068923904, + "loss": 0.7697, + "grad_norm": 3.893139362335205, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 3.661881344, + "gpu_mem": 1.068942336, + "loss": 0.8079, + "grad_norm": 4.086031436920166, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 3.66227456, + "gpu_mem": 1.0689024, + "loss": 0.6083, + "grad_norm": 3.3459105491638184, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 3.662667776, + "gpu_mem": 1.068934656, + "loss": 0.9591, + "grad_norm": 4.738063335418701, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 3.663060992, + "gpu_mem": 1.068954624, + "loss": 0.6531, + "grad_norm": 3.172518014907837, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 3.663454208, + "gpu_mem": 1.06894848, + "loss": 0.6172, + "grad_norm": 3.9292311668395996, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 3.663650816, + "gpu_mem": 1.068911616, + "loss": 0.7705, + "grad_norm": 5.426601886749268, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 3.663847424, + "gpu_mem": 1.068928512, + "loss": 0.5207, + "grad_norm": 2.992875337600708, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 3.66424064, + "gpu_mem": 1.068905472, + "loss": 0.8671, + "grad_norm": 4.146549701690674, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 3.664437248, + "gpu_mem": 1.068937728, + "loss": 0.7624, + "grad_norm": 5.474912166595459, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 3.664830464, + "gpu_mem": 1.06893312, + "loss": 0.7192, + "grad_norm": 4.443337917327881, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 3.66522368, + "gpu_mem": 1.068942336, + "loss": 0.6634, + "grad_norm": 4.000138759613037, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 3.665616896, + "gpu_mem": 1.068916224, + "loss": 0.7006, + "grad_norm": 4.549437046051025, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 3.666010112, + "gpu_mem": 1.068936192, + "loss": 0.5514, + "grad_norm": 3.3391573429107666, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 3.666403328, + "gpu_mem": 1.06891008, + "loss": 0.8424, + "grad_norm": 5.571780681610107, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 3.666599936, + "gpu_mem": 1.068934656, + "loss": 0.6746, + "grad_norm": 5.679649353027344, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 3.666993152, + "gpu_mem": 1.068919296, + "loss": 0.5828, + "grad_norm": 5.107166290283203, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 3.66718976, + "gpu_mem": 1.068953088, + "loss": 0.8542, + "grad_norm": 5.064152717590332, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 3.667582976, + "gpu_mem": 1.06893312, + "loss": 0.6634, + "grad_norm": 3.8671913146972656, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 3.667779584, + "gpu_mem": 1.06891776, + "loss": 0.5969, + "grad_norm": 3.456469774246216, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 3.667976192, + "gpu_mem": 1.068953088, + "loss": 0.7159, + "grad_norm": 4.374657154083252, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 3.668369408, + "gpu_mem": 1.068959232, + "loss": 0.5248, + "grad_norm": 4.052602767944336, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 3.668566016, + "gpu_mem": 1.068922368, + "loss": 0.7735, + "grad_norm": 5.499065399169922, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 3.668959232, + "gpu_mem": 1.068900864, + "loss": 0.66, + "grad_norm": 4.687088489532471, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 3.66915584, + "gpu_mem": 1.068953088, + "loss": 0.7378, + "grad_norm": 4.42564582824707, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 3.669549056, + "gpu_mem": 1.068939264, + "loss": 0.5222, + "grad_norm": 3.963618040084839, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 3.669745664, + "gpu_mem": 1.06893312, + "loss": 0.6482, + "grad_norm": 3.637404441833496, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 3.669942272, + "gpu_mem": 1.068939264, + "loss": 0.5215, + "grad_norm": 2.80082368850708, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 3.670335488, + "gpu_mem": 1.068916224, + "loss": 0.751, + "grad_norm": 5.434114456176758, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 3.670532096, + "gpu_mem": 1.068930048, + "loss": 0.5404, + "grad_norm": 4.525430679321289, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 3.670728704, + "gpu_mem": 1.068930048, + "loss": 0.7697, + "grad_norm": 3.933042049407959, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 3.67112192, + "gpu_mem": 1.068899328, + "loss": 0.5525, + "grad_norm": 3.013709306716919, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 3.671318528, + "gpu_mem": 1.06893312, + "loss": 0.639, + "grad_norm": 4.330655574798584, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 3.671711744, + "gpu_mem": 1.068911616, + "loss": 0.6767, + "grad_norm": 4.145953178405762, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 3.671908352, + "gpu_mem": 1.068919296, + "loss": 0.525, + "grad_norm": 4.256671905517578, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 3.672301568, + "gpu_mem": 1.068937728, + "loss": 0.7488, + "grad_norm": 6.108345985412598, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 3.672498176, + "gpu_mem": 1.068905472, + "loss": 0.775, + "grad_norm": 6.402830600738525, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 3.672891392, + "gpu_mem": 1.06891008, + "loss": 0.5887, + "grad_norm": 5.264021396636963, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 3.673088, + "gpu_mem": 1.068905472, + "loss": 0.5376, + "grad_norm": 4.079029560089111, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 3.673284608, + "gpu_mem": 1.068950016, + "loss": 0.6167, + "grad_norm": 4.916357040405273, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 3.673481216, + "gpu_mem": 1.06893312, + "loss": 0.4996, + "grad_norm": 3.913182497024536, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 3.673677824, + "gpu_mem": 1.068922368, + "loss": 0.6428, + "grad_norm": 4.817429065704346, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 3.673874432, + "gpu_mem": 1.068943872, + "loss": 0.6884, + "grad_norm": 5.728304862976074, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 3.674267648, + "gpu_mem": 1.06891008, + "loss": 0.6222, + "grad_norm": 4.745124340057373, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 3.674660864, + "gpu_mem": 1.06892544, + "loss": 0.4946, + "grad_norm": 3.9268112182617188, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 3.674857472, + "gpu_mem": 1.06892544, + "loss": 0.6331, + "grad_norm": 4.514761924743652, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 3.67505408, + "gpu_mem": 1.068916224, + "loss": 0.6301, + "grad_norm": 5.261031627655029, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 3.675250688, + "gpu_mem": 1.068926976, + "loss": 0.9208, + "grad_norm": 7.41164493560791, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 3.675447296, + "gpu_mem": 1.068951552, + "loss": 0.5531, + "grad_norm": 4.394953727722168, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 3.675643904, + "gpu_mem": 1.068903936, + "loss": 0.9875, + "grad_norm": 7.115608215332031, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 3.675840512, + "gpu_mem": 1.068939264, + "loss": 0.667, + "grad_norm": 7.749819278717041, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 3.676233728, + "gpu_mem": 1.068900864, + "loss": 0.5808, + "grad_norm": 5.520841598510742, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 3.676430336, + "gpu_mem": 1.068919296, + "loss": 0.6864, + "grad_norm": 4.350959300994873, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 3.676626944, + "gpu_mem": 1.068911616, + "loss": 0.594, + "grad_norm": 4.002154350280762, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 3.676823552, + "gpu_mem": 1.06894848, + "loss": 0.6793, + "grad_norm": 5.034749507904053, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 3.677216768, + "gpu_mem": 1.068908544, + "loss": 0.561, + "grad_norm": 5.928739070892334, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 3.677413376, + "gpu_mem": 1.068922368, + "loss": 0.5695, + "grad_norm": 5.07590389251709, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 3.677609984, + "gpu_mem": 1.068926976, + "loss": 0.4764, + "grad_norm": 4.166762351989746, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 3.677806592, + "gpu_mem": 1.068888576, + "loss": 0.6157, + "grad_norm": 3.5054402351379395, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 3.678199808, + "gpu_mem": 1.068911616, + "loss": 0.859, + "grad_norm": 5.1402974128723145, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 3.678396416, + "gpu_mem": 1.06891008, + "loss": 0.5816, + "grad_norm": 4.171634674072266, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 3.678593024, + "gpu_mem": 1.068928512, + "loss": 0.5752, + "grad_norm": 3.6238744258880615, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 3.678789632, + "gpu_mem": 1.06892544, + "loss": 0.7205, + "grad_norm": 5.293195724487305, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 3.67898624, + "gpu_mem": 1.068923904, + "loss": 0.652, + "grad_norm": 4.673212051391602, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 3.679182848, + "gpu_mem": 1.068942336, + "loss": 0.4656, + "grad_norm": 3.5685439109802246, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 3.679379456, + "gpu_mem": 1.068903936, + "loss": 0.6198, + "grad_norm": 3.7935495376586914, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 3.679576064, + "gpu_mem": 1.06894848, + "loss": 0.4605, + "grad_norm": 3.1759986877441406, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 3.679772672, + "gpu_mem": 1.068913152, + "loss": 0.5463, + "grad_norm": 4.8020453453063965, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 3.680165888, + "gpu_mem": 1.0689408, + "loss": 0.6799, + "grad_norm": 4.44742488861084, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 3.680362496, + "gpu_mem": 1.068920832, + "loss": 0.5919, + "grad_norm": 4.612300872802734, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 3.680559104, + "gpu_mem": 1.068966912, + "loss": 0.6977, + "grad_norm": 5.49208927154541, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 3.680755712, + "gpu_mem": 1.068931584, + "loss": 0.5507, + "grad_norm": 3.222627878189087, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 3.68095232, + "gpu_mem": 1.068922368, + "loss": 0.4585, + "grad_norm": 3.6906895637512207, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 3.681148928, + "gpu_mem": 1.068916224, + "loss": 0.4838, + "grad_norm": 4.87263822555542, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 3.681345536, + "gpu_mem": 1.068900864, + "loss": 0.5431, + "grad_norm": 5.150810718536377, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 3.681542144, + "gpu_mem": 1.068919296, + "loss": 0.7398, + "grad_norm": 7.363195896148682, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 3.681738752, + "gpu_mem": 1.068920832, + "loss": 0.4754, + "grad_norm": 3.6689155101776123, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 3.68193536, + "gpu_mem": 1.06892544, + "loss": 0.7902, + "grad_norm": 7.714516639709473, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 3.682131968, + "gpu_mem": 1.068928512, + "loss": 0.5132, + "grad_norm": 5.26685094833374, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 3.682328576, + "gpu_mem": 1.068922368, + "loss": 0.8145, + "grad_norm": 5.670719623565674, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 3.682525184, + "gpu_mem": 1.06894848, + "loss": 0.6959, + "grad_norm": 4.778154373168945, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 3.682721792, + "gpu_mem": 1.068916224, + "loss": 0.5175, + "grad_norm": 3.3921396732330322, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 3.6829184, + "gpu_mem": 1.068943872, + "loss": 0.5008, + "grad_norm": 4.269102573394775, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 3.683115008, + "gpu_mem": 1.068951552, + "loss": 0.5957, + "grad_norm": 5.3184494972229, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 3.683311616, + "gpu_mem": 1.06893312, + "loss": 0.737, + "grad_norm": 6.011037826538086, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 3.683508224, + "gpu_mem": 1.068919296, + "loss": 0.4666, + "grad_norm": 3.5450170040130615, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 3.683704832, + "gpu_mem": 1.068930048, + "loss": 0.5067, + "grad_norm": 3.7873008251190186, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 3.68390144, + "gpu_mem": 1.068922368, + "loss": 0.5592, + "grad_norm": 4.465864658355713, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 3.684098048, + "gpu_mem": 1.068939264, + "loss": 0.6529, + "grad_norm": 4.5889458656311035, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 3.684294656, + "gpu_mem": 1.068911616, + "loss": 0.4304, + "grad_norm": 3.320127248764038, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 3.684491264, + "gpu_mem": 1.068942336, + "loss": 0.6556, + "grad_norm": 5.153737545013428, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 3.684491264, + "gpu_mem": 1.068923904, + "loss": 0.7311, + "grad_norm": 6.1648712158203125, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 3.684687872, + "gpu_mem": 1.068911616, + "loss": 0.7341, + "grad_norm": 4.8717942237854, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 3.68488448, + "gpu_mem": 1.068923904, + "loss": 0.6688, + "grad_norm": 6.242313861846924, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 3.685081088, + "gpu_mem": 1.068930048, + "loss": 0.4514, + "grad_norm": 3.957883358001709, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 3.685277696, + "gpu_mem": 1.06891776, + "loss": 0.4141, + "grad_norm": 4.1003899574279785, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 3.685474304, + "gpu_mem": 1.068907008, + "loss": 0.4795, + "grad_norm": 4.800900459289551, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 3.685670912, + "gpu_mem": 1.068908544, + "loss": 0.5706, + "grad_norm": 4.966209888458252, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 3.68586752, + "gpu_mem": 1.068922368, + "loss": 0.6212, + "grad_norm": 3.8365538120269775, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 3.686064128, + "gpu_mem": 1.06892544, + "loss": 0.5309, + "grad_norm": 3.9537160396575928, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 3.686260736, + "gpu_mem": 1.068936192, + "loss": 0.6038, + "grad_norm": 5.394312858581543, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 3.686457344, + "gpu_mem": 1.06891008, + "loss": 0.5619, + "grad_norm": 4.403983116149902, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 3.686653952, + "gpu_mem": 1.06892544, + "loss": 0.7091, + "grad_norm": 5.001778602600098, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 3.68685056, + "gpu_mem": 1.068934656, + "loss": 0.7453, + "grad_norm": 4.899540424346924, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 3.68685056, + "gpu_mem": 1.068908544, + "loss": 0.5948, + "grad_norm": 4.7623724937438965, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 3.687047168, + "gpu_mem": 1.068914688, + "loss": 0.6355, + "grad_norm": 5.609147071838379, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 3.687243776, + "gpu_mem": 1.068903936, + "loss": 0.5703, + "grad_norm": 4.839284896850586, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 3.687440384, + "gpu_mem": 1.06891008, + "loss": 0.4874, + "grad_norm": 3.9422295093536377, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 3.687636992, + "gpu_mem": 1.068946944, + "loss": 0.4686, + "grad_norm": 4.363880157470703, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 3.6878336, + "gpu_mem": 1.06889472, + "loss": 0.6875, + "grad_norm": 4.734619140625, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 3.688030208, + "gpu_mem": 1.068914688, + "loss": 0.6277, + "grad_norm": 4.633767127990723, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 3.688226816, + "gpu_mem": 1.068914688, + "loss": 0.5665, + "grad_norm": 4.933912754058838, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 3.688423424, + "gpu_mem": 1.068913152, + "loss": 0.5227, + "grad_norm": 3.5733566284179688, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 3.688620032, + "gpu_mem": 1.068911616, + "loss": 0.4782, + "grad_norm": 4.727837562561035, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 3.68881664, + "gpu_mem": 1.068903936, + "loss": 0.6475, + "grad_norm": 4.555899620056152, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 3.689013248, + "gpu_mem": 1.06896384, + "loss": 0.6005, + "grad_norm": 5.9155073165893555, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 3.689209856, + "gpu_mem": 1.068908544, + "loss": 0.4718, + "grad_norm": 4.199721336364746, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 3.689209856, + "gpu_mem": 1.068891648, + "loss": 0.6654, + "grad_norm": 3.492733955383301, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 3.689406464, + "gpu_mem": 1.068922368, + "loss": 0.6579, + "grad_norm": 4.723977088928223, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 3.689406464, + "gpu_mem": 1.068966912, + "loss": 0.6274, + "grad_norm": 3.4643001556396484, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 3.689603072, + "gpu_mem": 1.068946944, + "loss": 0.4363, + "grad_norm": 3.7927098274230957, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 3.68979968, + "gpu_mem": 1.068946944, + "loss": 0.4601, + "grad_norm": 3.6562843322753906, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 3.689996288, + "gpu_mem": 1.068913152, + "loss": 0.5589, + "grad_norm": 4.747631072998047, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 3.690192896, + "gpu_mem": 1.068937728, + "loss": 0.5499, + "grad_norm": 3.4296746253967285, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 3.690389504, + "gpu_mem": 1.0689408, + "loss": 0.7061, + "grad_norm": 4.09951114654541, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 3.690389504, + "gpu_mem": 1.068919296, + "loss": 0.4788, + "grad_norm": 3.6731388568878174, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 3.690586112, + "gpu_mem": 1.068942336, + "loss": 0.6127, + "grad_norm": 7.020590782165527, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 3.69078272, + "gpu_mem": 1.068922368, + "loss": 0.6355, + "grad_norm": 4.589057922363281, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 3.690979328, + "gpu_mem": 1.068946944, + "loss": 0.4284, + "grad_norm": 2.7822108268737793, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 3.691175936, + "gpu_mem": 1.068930048, + "loss": 0.6005, + "grad_norm": 4.445858955383301, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 3.691372544, + "gpu_mem": 1.06892544, + "loss": 0.5123, + "grad_norm": 5.312275409698486, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 3.691569152, + "gpu_mem": 1.068937728, + "loss": 0.4995, + "grad_norm": 4.677011966705322, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 3.691569152, + "gpu_mem": 1.068908544, + "loss": 0.4492, + "grad_norm": 4.087525367736816, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 3.69176576, + "gpu_mem": 1.068922368, + "loss": 0.7452, + "grad_norm": 5.13217830657959, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 3.691962368, + "gpu_mem": 1.068908544, + "loss": 0.4957, + "grad_norm": 4.13884162902832, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 3.692158976, + "gpu_mem": 1.0689024, + "loss": 0.5629, + "grad_norm": 4.908017635345459, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 3.692158976, + "gpu_mem": 1.068908544, + "loss": 0.4695, + "grad_norm": 3.737800359725952, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 3.692355584, + "gpu_mem": 1.068922368, + "loss": 0.569, + "grad_norm": 4.514204025268555, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 3.692355584, + "gpu_mem": 1.068905472, + "loss": 0.7146, + "grad_norm": 4.970044136047363, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 3.692552192, + "gpu_mem": 1.068959232, + "loss": 0.4688, + "grad_norm": 4.302603721618652, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 3.692552192, + "gpu_mem": 1.0689024, + "loss": 0.5877, + "grad_norm": 5.03633975982666, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 3.6927488, + "gpu_mem": 1.068980736, + "loss": 0.4788, + "grad_norm": 4.328243732452393, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 3.692945408, + "gpu_mem": 1.068923904, + "loss": 0.5718, + "grad_norm": 5.801661968231201, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 3.692945408, + "gpu_mem": 1.068942336, + "loss": 0.6691, + "grad_norm": 5.900737762451172, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 3.693142016, + "gpu_mem": 1.06891776, + "loss": 0.4992, + "grad_norm": 4.211178302764893, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 3.693338624, + "gpu_mem": 1.068950016, + "loss": 0.4803, + "grad_norm": 3.553295373916626, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 3.693535232, + "gpu_mem": 1.068969984, + "loss": 0.6709, + "grad_norm": 5.269535541534424, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 3.69373184, + "gpu_mem": 1.068899328, + "loss": 0.4393, + "grad_norm": 3.828449010848999, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 3.69373184, + "gpu_mem": 1.068913152, + "loss": 0.4214, + "grad_norm": 4.072690963745117, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 3.693928448, + "gpu_mem": 1.068897792, + "loss": 0.6396, + "grad_norm": 4.663785934448242, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 3.694125056, + "gpu_mem": 1.068936192, + "loss": 0.4275, + "grad_norm": 3.04491925239563, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 3.694125056, + "gpu_mem": 1.068936192, + "loss": 0.56, + "grad_norm": 4.937060356140137, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 3.694321664, + "gpu_mem": 1.068922368, + "loss": 0.4752, + "grad_norm": 3.9767162799835205, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 3.694321664, + "gpu_mem": 1.068913152, + "loss": 0.6371, + "grad_norm": 4.739984512329102, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 3.694518272, + "gpu_mem": 1.06891776, + "loss": 0.6044, + "grad_norm": 3.8127496242523193, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 3.694518272, + "gpu_mem": 1.068920832, + "loss": 0.5476, + "grad_norm": 3.7579257488250732, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 3.69471488, + "gpu_mem": 1.068926976, + "loss": 0.4237, + "grad_norm": 3.084677219390869, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 3.694911488, + "gpu_mem": 1.068945408, + "loss": 0.598, + "grad_norm": 4.271091461181641, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 3.694911488, + "gpu_mem": 1.068939264, + "loss": 0.6914, + "grad_norm": 7.305878639221191, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 3.695108096, + "gpu_mem": 1.068916224, + "loss": 0.4301, + "grad_norm": 4.184849739074707, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 3.695108096, + "gpu_mem": 1.068903936, + "loss": 0.6571, + "grad_norm": 6.094544887542725, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 3.695304704, + "gpu_mem": 1.068870144, + "loss": 0.6321, + "grad_norm": 4.58673620223999, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 3.695304704, + "gpu_mem": 1.06891776, + "loss": 0.5783, + "grad_norm": 4.275995254516602, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 3.695501312, + "gpu_mem": 1.068883968, + "loss": 0.6459, + "grad_norm": 4.28321647644043, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 3.695501312, + "gpu_mem": 1.068931584, + "loss": 0.5234, + "grad_norm": 4.24468994140625, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 3.69569792, + "gpu_mem": 1.068930048, + "loss": 0.6962, + "grad_norm": 4.224254131317139, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 3.69569792, + "gpu_mem": 1.068931584, + "loss": 0.3024, + "grad_norm": 3.2612826824188232, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 3.695894528, + "gpu_mem": 1.0689408, + "loss": 0.8216, + "grad_norm": 5.949036121368408, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 3.696091136, + "gpu_mem": 1.068916224, + "loss": 0.65, + "grad_norm": 4.233695030212402, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 3.696091136, + "gpu_mem": 1.068900864, + "loss": 0.7033, + "grad_norm": 4.306166172027588, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 3.696287744, + "gpu_mem": 1.068930048, + "loss": 0.5628, + "grad_norm": 4.698094844818115, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 3.696484352, + "gpu_mem": 1.068943872, + "loss": 0.6219, + "grad_norm": 5.521763324737549, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 3.696484352, + "gpu_mem": 1.068899328, + "loss": 0.6146, + "grad_norm": 4.403607368469238, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 3.696484352, + "gpu_mem": 1.068905472, + "loss": 0.6388, + "grad_norm": 4.291751384735107, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 3.69668096, + "gpu_mem": 1.068934656, + "loss": 0.6389, + "grad_norm": 4.635309219360352, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 3.696877568, + "gpu_mem": 1.068930048, + "loss": 0.4253, + "grad_norm": 3.165823459625244, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 3.697074176, + "gpu_mem": 1.068916224, + "loss": 0.7286, + "grad_norm": 4.186193466186523, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 3.697270784, + "gpu_mem": 1.068930048, + "loss": 0.6098, + "grad_norm": 5.229433059692383, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 3.697270784, + "gpu_mem": 1.068919296, + "loss": 0.626, + "grad_norm": 3.8929617404937744, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 3.697467392, + "gpu_mem": 1.06892544, + "loss": 0.4106, + "grad_norm": 3.5405330657958984, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 3.697467392, + "gpu_mem": 1.068930048, + "loss": 0.6439, + "grad_norm": 4.773582458496094, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 3.697664, + "gpu_mem": 1.06892544, + "loss": 0.5664, + "grad_norm": 4.036649703979492, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 3.697664, + "gpu_mem": 1.068899328, + "loss": 0.5174, + "grad_norm": 3.582227945327759, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 3.697860608, + "gpu_mem": 1.068908544, + "loss": 0.5817, + "grad_norm": 4.715714454650879, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 3.697860608, + "gpu_mem": 1.068926976, + "loss": 0.6481, + "grad_norm": 4.382011413574219, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 3.698057216, + "gpu_mem": 1.068897792, + "loss": 0.547, + "grad_norm": 4.958312034606934, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 3.698057216, + "gpu_mem": 1.068928512, + "loss": 0.5647, + "grad_norm": 4.69865083694458, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 3.698057216, + "gpu_mem": 1.068937728, + "loss": 0.5149, + "grad_norm": 3.9285285472869873, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 3.698057216, + "gpu_mem": 1.068899328, + "loss": 0.7035, + "grad_norm": 4.010663986206055, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 3.698253824, + "gpu_mem": 1.068903936, + "loss": 0.4743, + "grad_norm": 3.8897485733032227, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 3.698253824, + "gpu_mem": 1.068928512, + "loss": 0.5699, + "grad_norm": 4.136284828186035, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 3.698450432, + "gpu_mem": 1.068946944, + "loss": 0.5453, + "grad_norm": 4.241669178009033, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 3.698450432, + "gpu_mem": 1.068928512, + "loss": 0.5908, + "grad_norm": 4.509785175323486, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 3.69864704, + "gpu_mem": 1.0689792, + "loss": 0.679, + "grad_norm": 4.614045143127441, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 3.69864704, + "gpu_mem": 1.068911616, + "loss": 0.8797, + "grad_norm": 5.482229709625244, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 3.698843648, + "gpu_mem": 1.068913152, + "loss": 0.6484, + "grad_norm": 6.532393932342529, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 3.698843648, + "gpu_mem": 1.068913152, + "loss": 0.5767, + "grad_norm": 4.378352642059326, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 3.699040256, + "gpu_mem": 1.068919296, + "loss": 0.563, + "grad_norm": 3.8064088821411133, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 3.699040256, + "gpu_mem": 1.06893312, + "loss": 0.3985, + "grad_norm": 3.112156867980957, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 3.699236864, + "gpu_mem": 1.068937728, + "loss": 0.6647, + "grad_norm": 4.8927435874938965, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 3.699236864, + "gpu_mem": 1.068931584, + "loss": 0.4965, + "grad_norm": 4.474597454071045, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 3.699433472, + "gpu_mem": 1.06892544, + "loss": 0.44, + "grad_norm": 5.053464412689209, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 3.699433472, + "gpu_mem": 1.068939264, + "loss": 0.7102, + "grad_norm": 5.657016754150391, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 3.699433472, + "gpu_mem": 1.068931584, + "loss": 0.7371, + "grad_norm": 4.528857231140137, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 3.69963008, + "gpu_mem": 1.068916224, + "loss": 0.667, + "grad_norm": 5.1351823806762695, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 3.699826688, + "gpu_mem": 1.06892544, + "loss": 0.6823, + "grad_norm": 4.498108863830566, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 3.699826688, + "gpu_mem": 1.068934656, + "loss": 0.5289, + "grad_norm": 4.13942813873291, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 3.699826688, + "gpu_mem": 1.068937728, + "loss": 0.5303, + "grad_norm": 4.272716999053955, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 3.700023296, + "gpu_mem": 1.068891648, + "loss": 0.6603, + "grad_norm": 4.804344654083252, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 3.700219904, + "gpu_mem": 1.068945408, + "loss": 0.4328, + "grad_norm": 3.9889183044433594, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 3.700219904, + "gpu_mem": 1.068946944, + "loss": 0.5002, + "grad_norm": 3.6955435276031494, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 3.700416512, + "gpu_mem": 1.068891648, + "loss": 0.5883, + "grad_norm": 4.687688827514648, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 3.700416512, + "gpu_mem": 1.06892544, + "loss": 0.6536, + "grad_norm": 3.500284433364868, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 3.70061312, + "gpu_mem": 1.068903936, + "loss": 0.6392, + "grad_norm": 3.868014097213745, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 3.70061312, + "gpu_mem": 1.068934656, + "loss": 0.5381, + "grad_norm": 4.011346340179443, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 3.70061312, + "gpu_mem": 1.06891008, + "loss": 0.5674, + "grad_norm": 5.461002349853516, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 3.70061312, + "gpu_mem": 1.068943872, + "loss": 0.5364, + "grad_norm": 3.958575963973999, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 3.700809728, + "gpu_mem": 1.06896384, + "loss": 0.6262, + "grad_norm": 4.548100471496582, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 3.700809728, + "gpu_mem": 1.068928512, + "loss": 0.4246, + "grad_norm": 4.977705001831055, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 3.701006336, + "gpu_mem": 1.06894848, + "loss": 0.4728, + "grad_norm": 3.3230903148651123, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 3.701006336, + "gpu_mem": 1.068928512, + "loss": 0.4614, + "grad_norm": 3.138862371444702, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 3.701202944, + "gpu_mem": 1.068930048, + "loss": 0.5533, + "grad_norm": 4.466731071472168, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 3.701202944, + "gpu_mem": 1.068922368, + "loss": 0.4547, + "grad_norm": 3.639528512954712, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 3.701399552, + "gpu_mem": 1.068920832, + "loss": 0.5222, + "grad_norm": 5.035019874572754, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 3.701399552, + "gpu_mem": 1.068934656, + "loss": 0.6339, + "grad_norm": 4.1888427734375, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 3.70159616, + "gpu_mem": 1.068905472, + "loss": 0.5845, + "grad_norm": 4.517126083374023, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 3.70159616, + "gpu_mem": 1.06895616, + "loss": 0.6354, + "grad_norm": 4.511905670166016, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 3.701792768, + "gpu_mem": 1.068953088, + "loss": 0.6027, + "grad_norm": 3.977062225341797, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 3.701792768, + "gpu_mem": 1.06893312, + "loss": 0.654, + "grad_norm": 4.243345260620117, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 3.701792768, + "gpu_mem": 1.068914688, + "loss": 0.467, + "grad_norm": 3.285316228866577, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 3.701792768, + "gpu_mem": 1.068923904, + "loss": 0.5529, + "grad_norm": 3.6769697666168213, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 3.701989376, + "gpu_mem": 1.068891648, + "loss": 0.7058, + "grad_norm": 5.954848289489746, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 3.701989376, + "gpu_mem": 1.068953088, + "loss": 0.6095, + "grad_norm": 3.912317991256714, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 3.701989376, + "gpu_mem": 1.068951552, + "loss": 0.4329, + "grad_norm": 3.595440626144409, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 3.701989376, + "gpu_mem": 1.068907008, + "loss": 0.5718, + "grad_norm": 3.993145704269409, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 3.702185984, + "gpu_mem": 1.068939264, + "loss": 0.5107, + "grad_norm": 3.7426540851593018, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 3.702382592, + "gpu_mem": 1.06893312, + "loss": 0.5868, + "grad_norm": 4.952030658721924, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 3.702382592, + "gpu_mem": 1.068919296, + "loss": 0.4077, + "grad_norm": 4.373257160186768, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 3.702382592, + "gpu_mem": 1.068919296, + "loss": 0.714, + "grad_norm": 5.872477054595947, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 3.702382592, + "gpu_mem": 1.068945408, + "loss": 0.5196, + "grad_norm": 4.012609004974365, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 3.702382592, + "gpu_mem": 1.06893312, + "loss": 0.3796, + "grad_norm": 3.6921918392181396, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 3.7025792, + "gpu_mem": 1.068923904, + "loss": 0.7424, + "grad_norm": 4.875439167022705, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 3.7025792, + "gpu_mem": 1.068916224, + "loss": 0.6439, + "grad_norm": 4.768589019775391, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 3.702775808, + "gpu_mem": 1.068939264, + "loss": 0.48, + "grad_norm": 4.131857872009277, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 3.702775808, + "gpu_mem": 1.068928512, + "loss": 0.4316, + "grad_norm": 3.5021822452545166, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 3.702775808, + "gpu_mem": 1.068913152, + "loss": 0.45, + "grad_norm": 4.173514366149902, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 3.702972416, + "gpu_mem": 1.0689792, + "loss": 0.5277, + "grad_norm": 4.584526062011719, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 3.702972416, + "gpu_mem": 1.068922368, + "loss": 0.6162, + "grad_norm": 5.303351879119873, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 3.702972416, + "gpu_mem": 1.068908544, + "loss": 0.6024, + "grad_norm": 4.325442790985107, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 3.703169024, + "gpu_mem": 1.068973056, + "loss": 0.5827, + "grad_norm": 4.7859206199646, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 3.703169024, + "gpu_mem": 1.0689024, + "loss": 0.5653, + "grad_norm": 5.052596092224121, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 3.703169024, + "gpu_mem": 1.068930048, + "loss": 0.3526, + "grad_norm": 4.2595696449279785, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 3.703169024, + "gpu_mem": 1.06893312, + "loss": 0.5458, + "grad_norm": 3.7583529949188232, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 3.703365632, + "gpu_mem": 1.068908544, + "loss": 0.6823, + "grad_norm": 4.819891452789307, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 3.703365632, + "gpu_mem": 1.068936192, + "loss": 0.4797, + "grad_norm": 3.910820484161377, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 3.703365632, + "gpu_mem": 1.068945408, + "loss": 0.5003, + "grad_norm": 4.4316205978393555, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 3.703365632, + "gpu_mem": 1.068942336, + "loss": 0.5403, + "grad_norm": 3.389054536819458, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 3.70356224, + "gpu_mem": 1.068914688, + "loss": 0.5898, + "grad_norm": 4.551488399505615, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 3.70356224, + "gpu_mem": 1.068661248, + "loss": 0.5554, + "grad_norm": 5.126403331756592, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 3.70356224, + "gpu_mem": 1.068661248, + "train_runtime": 8218.2008, + "train_samples_per_second": 4.856, + "train_steps_per_second": 0.076, + "total_flos": 8.436434983002931e+16, + "train_loss": 1.034108434111262 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..97cff55d3f03a364161498b7b6299c246238daf5 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..bf2d4646b3c02c64af7c95651e608cf3ec2327d7 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.8402708623780123 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..491b002a28ed2c842a4ef1d5abc3bb97e5cf8b53 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-hellaswag-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-hellaswag-r32-a2", + "seed": 42, + "timestamp": "2025-09-01T13:44:24.650133" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..b67579da337b3f1344f94172c1ab087cb9f98420 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r32-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 3.273887744, + "gpu_mem": 1.15091712, + "loss": 4.3397, + "grad_norm": 100.92987823486328, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 3.274870784, + "gpu_mem": 1.352761344, + "loss": 4.4533, + "grad_norm": 133.08168029785156, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 3.276050432, + "gpu_mem": 1.352769024, + "loss": 3.7654, + "grad_norm": 590.2216186523438, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 3.277033472, + "gpu_mem": 1.352802816, + "loss": 3.3595, + "grad_norm": 41.941226959228516, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 3.278016512, + "gpu_mem": 1.352765952, + "loss": 2.8706, + "grad_norm": 37.05831527709961, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 3.278999552, + "gpu_mem": 1.352812032, + "loss": 2.3462, + "grad_norm": 32.51831817626953, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 3.279785984, + "gpu_mem": 1.352772096, + "loss": 1.954, + "grad_norm": 15.28129768371582, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 3.280769024, + "gpu_mem": 1.352802816, + "loss": 1.6054, + "grad_norm": 9.346803665161133, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 3.281555456, + "gpu_mem": 1.352802816, + "loss": 1.4567, + "grad_norm": 4.407958507537842, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 3.282341888, + "gpu_mem": 1.352745984, + "loss": 1.4654, + "grad_norm": 5.738370418548584, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 3.28312832, + "gpu_mem": 1.352765952, + "loss": 1.5278, + "grad_norm": 12.690369606018066, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 3.283718144, + "gpu_mem": 1.35276288, + "loss": 1.3955, + "grad_norm": 3.4439308643341064, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 3.284504576, + "gpu_mem": 1.3527552, + "loss": 1.4089, + "grad_norm": 4.246111869812012, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 3.285291008, + "gpu_mem": 1.352781312, + "loss": 1.3696, + "grad_norm": 2.61682391166687, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 3.285880832, + "gpu_mem": 1.352779776, + "loss": 1.4196, + "grad_norm": 3.3423378467559814, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 3.286667264, + "gpu_mem": 1.352772096, + "loss": 1.4437, + "grad_norm": 6.27014684677124, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 3.287257088, + "gpu_mem": 1.352772096, + "loss": 1.4769, + "grad_norm": 4.142366886138916, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 3.28804352, + "gpu_mem": 1.352772096, + "loss": 1.3308, + "grad_norm": 2.7859575748443604, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 3.288633344, + "gpu_mem": 1.352772096, + "loss": 1.4682, + "grad_norm": 3.6660778522491455, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 3.289419776, + "gpu_mem": 1.352745984, + "loss": 1.4964, + "grad_norm": 4.667084693908691, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 3.2900096, + "gpu_mem": 1.35276288, + "loss": 1.4264, + "grad_norm": 2.772883176803589, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 3.290599424, + "gpu_mem": 1.35277056, + "loss": 1.4308, + "grad_norm": 2.9880077838897705, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 3.291189248, + "gpu_mem": 1.352784384, + "loss": 1.3426, + "grad_norm": 1.3749895095825195, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 3.291779072, + "gpu_mem": 1.352769024, + "loss": 1.4355, + "grad_norm": 3.804508924484253, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 3.292368896, + "gpu_mem": 1.352756736, + "loss": 1.5942, + "grad_norm": 4.976098537445068, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 3.29295872, + "gpu_mem": 1.35276288, + "loss": 1.4757, + "grad_norm": 3.71560001373291, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 3.293548544, + "gpu_mem": 1.35277056, + "loss": 1.4164, + "grad_norm": 1.912729263305664, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 3.294138368, + "gpu_mem": 1.352765952, + "loss": 1.3877, + "grad_norm": 1.3759660720825195, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 3.294728192, + "gpu_mem": 1.352775168, + "loss": 1.464, + "grad_norm": 3.4728949069976807, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 3.295318016, + "gpu_mem": 1.35274752, + "loss": 1.3937, + "grad_norm": 1.7360819578170776, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 3.29590784, + "gpu_mem": 1.352802816, + "loss": 1.429, + "grad_norm": 3.5168113708496094, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 3.296497664, + "gpu_mem": 1.352795136, + "loss": 1.4386, + "grad_norm": 7.591375350952148, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 3.297087488, + "gpu_mem": 1.352749056, + "loss": 1.4115, + "grad_norm": 7.452263355255127, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 3.297677312, + "gpu_mem": 1.352767488, + "loss": 1.4308, + "grad_norm": 1.4257779121398926, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 3.298267136, + "gpu_mem": 1.352788992, + "loss": 1.4258, + "grad_norm": 2.988766670227051, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 3.298660352, + "gpu_mem": 1.352787456, + "loss": 1.4514, + "grad_norm": 3.049619674682617, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 3.299250176, + "gpu_mem": 1.352819712, + "loss": 1.4488, + "grad_norm": 3.132469654083252, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 3.29984, + "gpu_mem": 1.352772096, + "loss": 1.3989, + "grad_norm": 7.604755878448486, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 3.300429824, + "gpu_mem": 1.352828928, + "loss": 3.2663, + "grad_norm": 113.9813003540039, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 3.301019648, + "gpu_mem": 1.352756736, + "loss": 2.2254, + "grad_norm": 65.93851470947266, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 3.301412864, + "gpu_mem": 1.352784384, + "loss": 1.5008, + "grad_norm": 8.109131813049316, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 3.302002688, + "gpu_mem": 1.352798208, + "loss": 1.4679, + "grad_norm": 2.565181255340576, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 3.302592512, + "gpu_mem": 1.352804352, + "loss": 1.3984, + "grad_norm": 2.0048704147338867, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 3.302985728, + "gpu_mem": 1.352782848, + "loss": 1.405, + "grad_norm": 1.1386798620224, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 3.303575552, + "gpu_mem": 1.352782848, + "loss": 1.4013, + "grad_norm": 1.4569026231765747, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 3.303968768, + "gpu_mem": 1.352782848, + "loss": 1.4272, + "grad_norm": 2.0531582832336426, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 3.304558592, + "gpu_mem": 1.352769024, + "loss": 1.379, + "grad_norm": 0.37954869866371155, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 3.305148416, + "gpu_mem": 1.352787456, + "loss": 1.3688, + "grad_norm": 0.639064371585846, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 3.305541632, + "gpu_mem": 1.352799744, + "loss": 1.4548, + "grad_norm": 1.8328825235366821, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 3.306131456, + "gpu_mem": 1.352776704, + "loss": 1.3982, + "grad_norm": 1.063116431236267, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 3.30672128, + "gpu_mem": 1.352761344, + "loss": 1.5278, + "grad_norm": 3.9077277183532715, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 3.307311104, + "gpu_mem": 1.352765952, + "loss": 1.4479, + "grad_norm": 2.3305723667144775, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 3.30770432, + "gpu_mem": 1.3527936, + "loss": 1.394, + "grad_norm": 1.3088741302490234, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 3.308097536, + "gpu_mem": 1.352769024, + "loss": 1.5003, + "grad_norm": 3.1876721382141113, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 3.308490752, + "gpu_mem": 1.352787456, + "loss": 1.4057, + "grad_norm": 1.1560564041137695, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 3.308883968, + "gpu_mem": 1.352781312, + "loss": 1.4846, + "grad_norm": 3.240968704223633, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 3.309473792, + "gpu_mem": 1.35274752, + "loss": 1.3526, + "grad_norm": 0.8894960284233093, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 3.309867008, + "gpu_mem": 1.352776704, + "loss": 1.4378, + "grad_norm": 1.7731860876083374, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 3.310456832, + "gpu_mem": 1.352759808, + "loss": 1.3963, + "grad_norm": 2.0301012992858887, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 3.310850048, + "gpu_mem": 1.35280128, + "loss": 1.466, + "grad_norm": 2.2221343517303467, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 3.311243264, + "gpu_mem": 1.352767488, + "loss": 1.4203, + "grad_norm": 0.8976136445999146, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 3.311833088, + "gpu_mem": 1.352807424, + "loss": 1.3267, + "grad_norm": 0.6199930310249329, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 3.312226304, + "gpu_mem": 1.352761344, + "loss": 1.4729, + "grad_norm": 1.5067228078842163, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 3.312816128, + "gpu_mem": 1.352765952, + "loss": 1.4489, + "grad_norm": 1.2022075653076172, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 3.313209344, + "gpu_mem": 1.35276288, + "loss": 1.3945, + "grad_norm": 0.5797547698020935, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 3.31360256, + "gpu_mem": 1.352781312, + "loss": 1.4355, + "grad_norm": 0.9906496405601501, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 3.314192384, + "gpu_mem": 1.352773632, + "loss": 1.4619, + "grad_norm": 1.590461015701294, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 3.3145856, + "gpu_mem": 1.352758272, + "loss": 1.4233, + "grad_norm": 1.724750280380249, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 3.314978816, + "gpu_mem": 1.352828928, + "loss": 1.4006, + "grad_norm": 1.6941472291946411, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 3.31556864, + "gpu_mem": 1.352779776, + "loss": 1.4582, + "grad_norm": 2.514803171157837, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 3.315961856, + "gpu_mem": 1.352804352, + "loss": 1.3566, + "grad_norm": 1.3100533485412598, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 3.316355072, + "gpu_mem": 1.352775168, + "loss": 1.4475, + "grad_norm": 1.839597463607788, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 3.316748288, + "gpu_mem": 1.352767488, + "loss": 1.3991, + "grad_norm": 1.8404405117034912, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 3.317338112, + "gpu_mem": 1.352761344, + "loss": 1.4373, + "grad_norm": 2.3490631580352783, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 3.317731328, + "gpu_mem": 1.352790528, + "loss": 1.3951, + "grad_norm": 1.2521231174468994, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 3.317927936, + "gpu_mem": 1.352781312, + "loss": 1.43, + "grad_norm": 2.6861536502838135, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 3.318321152, + "gpu_mem": 1.352769024, + "loss": 1.489, + "grad_norm": 2.2058324813842773, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 3.318714368, + "gpu_mem": 1.352761344, + "loss": 1.5198, + "grad_norm": 2.5155413150787354, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 3.319304192, + "gpu_mem": 1.352813568, + "loss": 1.4155, + "grad_norm": 1.308193564414978, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 3.319697408, + "gpu_mem": 1.352792064, + "loss": 1.4008, + "grad_norm": 0.7080330848693848, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 3.320090624, + "gpu_mem": 1.35278592, + "loss": 1.3941, + "grad_norm": 1.3022620677947998, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 3.32048384, + "gpu_mem": 1.35276288, + "loss": 1.3792, + "grad_norm": 1.2085868120193481, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 3.320877056, + "gpu_mem": 1.352784384, + "loss": 1.4942, + "grad_norm": 2.4671788215637207, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 3.321270272, + "gpu_mem": 1.352756736, + "loss": 1.4563, + "grad_norm": 1.8704583644866943, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 3.32146688, + "gpu_mem": 1.352764416, + "loss": 1.3958, + "grad_norm": 0.8095455169677734, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 3.322056704, + "gpu_mem": 1.352782848, + "loss": 1.4233, + "grad_norm": 1.2917191982269287, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 3.32244992, + "gpu_mem": 1.352772096, + "loss": 1.393, + "grad_norm": 0.8256767988204956, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 3.322646528, + "gpu_mem": 1.35277056, + "loss": 1.392, + "grad_norm": 0.9205710291862488, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 3.323039744, + "gpu_mem": 1.352765952, + "loss": 1.3871, + "grad_norm": 0.6106694936752319, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 3.32343296, + "gpu_mem": 1.35277056, + "loss": 1.3894, + "grad_norm": 0.6501631736755371, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 3.323826176, + "gpu_mem": 1.352781312, + "loss": 1.4803, + "grad_norm": 1.483115553855896, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 3.324219392, + "gpu_mem": 1.352784384, + "loss": 1.4105, + "grad_norm": 0.6299130916595459, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 3.324612608, + "gpu_mem": 1.352784384, + "loss": 1.4084, + "grad_norm": 0.46774837374687195, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 3.325005824, + "gpu_mem": 1.352779776, + "loss": 1.3876, + "grad_norm": 0.6071069240570068, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 3.32539904, + "gpu_mem": 1.352798208, + "loss": 1.4723, + "grad_norm": 1.4365779161453247, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 3.325792256, + "gpu_mem": 1.35280128, + "loss": 1.4003, + "grad_norm": 0.5780792236328125, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 3.32638208, + "gpu_mem": 1.35277824, + "loss": 1.389, + "grad_norm": 0.423502117395401, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 3.326578688, + "gpu_mem": 1.352788992, + "loss": 1.397, + "grad_norm": 0.6111699342727661, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 3.326971904, + "gpu_mem": 1.352788992, + "loss": 1.3802, + "grad_norm": 1.0353106260299683, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 3.32736512, + "gpu_mem": 1.352764416, + "loss": 1.3751, + "grad_norm": 0.4910379648208618, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 3.327758336, + "gpu_mem": 1.3527936, + "loss": 1.4364, + "grad_norm": 1.389988899230957, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 3.327954944, + "gpu_mem": 1.35277056, + "loss": 1.3421, + "grad_norm": 0.3157550096511841, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 3.32834816, + "gpu_mem": 1.352787456, + "loss": 1.4547, + "grad_norm": 1.2348442077636719, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 3.328741376, + "gpu_mem": 1.3527552, + "loss": 1.4241, + "grad_norm": 1.1774156093597412, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 3.328937984, + "gpu_mem": 1.35277056, + "loss": 1.3599, + "grad_norm": 0.3430196940898895, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 3.3293312, + "gpu_mem": 1.352750592, + "loss": 1.5007, + "grad_norm": 1.422131896018982, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 3.329724416, + "gpu_mem": 1.352792064, + "loss": 1.3974, + "grad_norm": 0.5625864863395691, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 3.329921024, + "gpu_mem": 1.352787456, + "loss": 1.427, + "grad_norm": 0.8971032500267029, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 3.33031424, + "gpu_mem": 1.3527936, + "loss": 1.3675, + "grad_norm": 0.6749948263168335, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 3.330707456, + "gpu_mem": 1.352790528, + "loss": 1.3845, + "grad_norm": 0.5770296454429626, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 3.330904064, + "gpu_mem": 1.352792064, + "loss": 1.3845, + "grad_norm": 0.9436963796615601, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 3.33129728, + "gpu_mem": 1.352788992, + "loss": 1.4012, + "grad_norm": 0.790702223777771, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 3.331690496, + "gpu_mem": 1.352769024, + "loss": 1.3942, + "grad_norm": 0.5395233035087585, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 3.332083712, + "gpu_mem": 1.352764416, + "loss": 1.3761, + "grad_norm": 0.9616730809211731, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 3.33228032, + "gpu_mem": 1.352782848, + "loss": 1.4024, + "grad_norm": 0.938653826713562, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 3.332673536, + "gpu_mem": 1.3527936, + "loss": 1.3691, + "grad_norm": 0.8187636137008667, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 3.333066752, + "gpu_mem": 1.352779776, + "loss": 1.3344, + "grad_norm": 0.8032742142677307, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 3.33326336, + "gpu_mem": 1.352795136, + "loss": 1.3029, + "grad_norm": 1.257274866104126, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 3.333656576, + "gpu_mem": 1.352776704, + "loss": 1.3671, + "grad_norm": 2.6734914779663086, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 3.334049792, + "gpu_mem": 1.352802816, + "loss": 1.4158, + "grad_norm": 2.612907648086548, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 3.3342464, + "gpu_mem": 1.352761344, + "loss": 1.1671, + "grad_norm": 3.7325973510742188, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 3.334639616, + "gpu_mem": 1.3527936, + "loss": 1.1936, + "grad_norm": 2.984410524368286, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 3.334836224, + "gpu_mem": 1.352787456, + "loss": 1.2621, + "grad_norm": 4.792877197265625, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 3.33522944, + "gpu_mem": 1.352788992, + "loss": 1.2368, + "grad_norm": 4.5045366287231445, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 3.335622656, + "gpu_mem": 1.352764416, + "loss": 1.1604, + "grad_norm": 3.405709743499756, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 3.335819264, + "gpu_mem": 1.352773632, + "loss": 1.0169, + "grad_norm": 2.9670467376708984, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 3.33621248, + "gpu_mem": 1.352759808, + "loss": 1.1367, + "grad_norm": 4.0289716720581055, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 3.336605696, + "gpu_mem": 1.352796672, + "loss": 1.6452, + "grad_norm": 18.877641677856445, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 3.336802304, + "gpu_mem": 1.3527936, + "loss": 1.2496, + "grad_norm": 4.017213344573975, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 3.33719552, + "gpu_mem": 1.3527936, + "loss": 1.1309, + "grad_norm": 2.1604321002960205, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 3.337588736, + "gpu_mem": 1.352782848, + "loss": 1.0652, + "grad_norm": 3.4464449882507324, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 3.337785344, + "gpu_mem": 1.352782848, + "loss": 1.0171, + "grad_norm": 3.1324374675750732, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 3.337981952, + "gpu_mem": 1.352764416, + "loss": 0.9996, + "grad_norm": 6.1859283447265625, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 3.338375168, + "gpu_mem": 1.352775168, + "loss": 0.9195, + "grad_norm": 5.48726224899292, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 3.338768384, + "gpu_mem": 1.352784384, + "loss": 1.0941, + "grad_norm": 4.883859157562256, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 3.3391616, + "gpu_mem": 1.352799744, + "loss": 1.2029, + "grad_norm": 5.216318607330322, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 3.339358208, + "gpu_mem": 1.35274752, + "loss": 1.1267, + "grad_norm": 5.090916156768799, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 3.339554816, + "gpu_mem": 1.352767488, + "loss": 1.0282, + "grad_norm": 5.455250263214111, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 3.339751424, + "gpu_mem": 1.352749056, + "loss": 1.2556, + "grad_norm": 7.9033918380737305, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 3.34014464, + "gpu_mem": 1.352765952, + "loss": 1.1382, + "grad_norm": 6.53822660446167, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 3.340341248, + "gpu_mem": 1.352772096, + "loss": 1.2672, + "grad_norm": 7.310843467712402, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 3.340537856, + "gpu_mem": 1.352769024, + "loss": 0.9532, + "grad_norm": 4.886610984802246, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 3.340931072, + "gpu_mem": 1.352795136, + "loss": 1.0267, + "grad_norm": 4.539045333862305, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 3.34112768, + "gpu_mem": 1.352769024, + "loss": 0.9589, + "grad_norm": 4.37456750869751, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 3.341520896, + "gpu_mem": 1.35280896, + "loss": 0.9297, + "grad_norm": 4.019575119018555, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 3.341914112, + "gpu_mem": 1.352758272, + "loss": 0.888, + "grad_norm": 4.063376426696777, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 3.34211072, + "gpu_mem": 1.352767488, + "loss": 1.1078, + "grad_norm": 4.855127811431885, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 3.342307328, + "gpu_mem": 1.352787456, + "loss": 0.9171, + "grad_norm": 3.656273365020752, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 3.342503936, + "gpu_mem": 1.35277824, + "loss": 0.8002, + "grad_norm": 3.9228451251983643, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 3.342897152, + "gpu_mem": 1.352790528, + "loss": 1.1274, + "grad_norm": 6.739924430847168, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 3.34309376, + "gpu_mem": 1.3527552, + "loss": 0.9213, + "grad_norm": 5.848925590515137, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 3.343290368, + "gpu_mem": 1.35278592, + "loss": 0.953, + "grad_norm": 4.98676872253418, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 3.343683584, + "gpu_mem": 1.352781312, + "loss": 1.0278, + "grad_norm": 6.806836128234863, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 3.343880192, + "gpu_mem": 1.352805888, + "loss": 1.0563, + "grad_norm": 5.994335651397705, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 3.3440768, + "gpu_mem": 1.352742912, + "loss": 0.8225, + "grad_norm": 4.079307556152344, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 3.344470016, + "gpu_mem": 1.352796672, + "loss": 0.9335, + "grad_norm": 3.615840435028076, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 3.344666624, + "gpu_mem": 1.352788992, + "loss": 0.8604, + "grad_norm": 4.664431095123291, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 3.34505984, + "gpu_mem": 1.352804352, + "loss": 0.8651, + "grad_norm": 5.017622947692871, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 3.345256448, + "gpu_mem": 1.352805888, + "loss": 0.6998, + "grad_norm": 3.949697494506836, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 3.345453056, + "gpu_mem": 1.352773632, + "loss": 0.9083, + "grad_norm": 5.275274753570557, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 3.345649664, + "gpu_mem": 1.352765952, + "loss": 0.7813, + "grad_norm": 5.771602630615234, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 3.345846272, + "gpu_mem": 1.35280128, + "loss": 0.7225, + "grad_norm": 4.672788619995117, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 3.346239488, + "gpu_mem": 1.352749056, + "loss": 0.7838, + "grad_norm": 4.843015670776367, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 3.346436096, + "gpu_mem": 1.352784384, + "loss": 0.6953, + "grad_norm": 4.913809776306152, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 3.346632704, + "gpu_mem": 1.352773632, + "loss": 0.7524, + "grad_norm": 4.6714019775390625, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 3.346829312, + "gpu_mem": 1.352805888, + "loss": 0.6402, + "grad_norm": 3.941246271133423, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 3.347222528, + "gpu_mem": 1.352799744, + "loss": 1.0076, + "grad_norm": 6.541812896728516, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 3.347419136, + "gpu_mem": 1.35277824, + "loss": 0.9792, + "grad_norm": 6.9837141036987305, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 3.347615744, + "gpu_mem": 1.352775168, + "loss": 1.1514, + "grad_norm": 7.488488674163818, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 3.347812352, + "gpu_mem": 1.352812032, + "loss": 0.7712, + "grad_norm": 4.2084736824035645, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 3.348205568, + "gpu_mem": 1.35278592, + "loss": 0.5356, + "grad_norm": 3.5332729816436768, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 3.348402176, + "gpu_mem": 1.35276288, + "loss": 0.8141, + "grad_norm": 4.9291534423828125, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 3.348598784, + "gpu_mem": 1.352788992, + "loss": 0.7257, + "grad_norm": 3.6385445594787598, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 3.348795392, + "gpu_mem": 1.35280128, + "loss": 0.8139, + "grad_norm": 5.343786239624023, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 3.348992, + "gpu_mem": 1.35276288, + "loss": 0.6552, + "grad_norm": 4.209808826446533, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 3.349188608, + "gpu_mem": 1.352772096, + "loss": 0.8632, + "grad_norm": 5.785989761352539, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 3.349385216, + "gpu_mem": 1.352753664, + "loss": 0.801, + "grad_norm": 6.07953405380249, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 3.349581824, + "gpu_mem": 1.352767488, + "loss": 0.6663, + "grad_norm": 5.91207218170166, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 3.34997504, + "gpu_mem": 1.352775168, + "loss": 0.6188, + "grad_norm": 5.506045818328857, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 3.350171648, + "gpu_mem": 1.352756736, + "loss": 0.7612, + "grad_norm": 5.508782386779785, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 3.350368256, + "gpu_mem": 1.352787456, + "loss": 0.7216, + "grad_norm": 6.086572647094727, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 3.350564864, + "gpu_mem": 1.352758272, + "loss": 1.0395, + "grad_norm": 6.62758731842041, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 3.350761472, + "gpu_mem": 1.352782848, + "loss": 0.8175, + "grad_norm": 5.682249069213867, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 3.351154688, + "gpu_mem": 1.35276288, + "loss": 0.6992, + "grad_norm": 4.889549732208252, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 3.351351296, + "gpu_mem": 1.352795136, + "loss": 0.9211, + "grad_norm": 4.956994533538818, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 3.351547904, + "gpu_mem": 1.35278592, + "loss": 0.8633, + "grad_norm": 4.5086565017700195, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 3.351744512, + "gpu_mem": 1.352781312, + "loss": 0.6578, + "grad_norm": 3.113751173019409, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 3.35194112, + "gpu_mem": 1.352738304, + "loss": 0.8889, + "grad_norm": 4.39888334274292, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 3.352137728, + "gpu_mem": 1.352818176, + "loss": 0.7895, + "grad_norm": 3.483107805252075, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 3.352334336, + "gpu_mem": 1.352769024, + "loss": 0.7678, + "grad_norm": 3.8315188884735107, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 3.352530944, + "gpu_mem": 1.352769024, + "loss": 0.5827, + "grad_norm": 2.6410374641418457, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 3.352727552, + "gpu_mem": 1.352735232, + "loss": 0.7833, + "grad_norm": 5.09644889831543, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 3.35292416, + "gpu_mem": 1.352775168, + "loss": 0.3553, + "grad_norm": 3.804572105407715, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 3.353120768, + "gpu_mem": 1.35277056, + "loss": 0.9002, + "grad_norm": 8.039546012878418, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 3.353317376, + "gpu_mem": 1.352758272, + "loss": 0.7548, + "grad_norm": 6.184501647949219, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 3.353513984, + "gpu_mem": 1.352782848, + "loss": 0.7569, + "grad_norm": 5.654435157775879, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 3.3539072, + "gpu_mem": 1.352798208, + "loss": 0.8129, + "grad_norm": 5.561279773712158, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 3.354103808, + "gpu_mem": 1.35277056, + "loss": 0.5849, + "grad_norm": 4.883157253265381, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 3.354300416, + "gpu_mem": 1.352767488, + "loss": 0.8093, + "grad_norm": 4.850515365600586, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 3.354300416, + "gpu_mem": 1.352782848, + "loss": 0.8594, + "grad_norm": 4.959714889526367, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 3.354497024, + "gpu_mem": 1.352759808, + "loss": 0.638, + "grad_norm": 3.9507689476013184, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 3.354693632, + "gpu_mem": 1.352761344, + "loss": 0.6546, + "grad_norm": 3.734008312225342, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 3.35489024, + "gpu_mem": 1.352802816, + "loss": 0.6195, + "grad_norm": 3.7673566341400146, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 3.355283456, + "gpu_mem": 1.352773632, + "loss": 0.5819, + "grad_norm": 3.303795099258423, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 3.355480064, + "gpu_mem": 1.352773632, + "loss": 0.853, + "grad_norm": 4.460951805114746, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 3.355676672, + "gpu_mem": 1.35277056, + "loss": 0.6226, + "grad_norm": 3.1424217224121094, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 3.35587328, + "gpu_mem": 1.35277056, + "loss": 0.6402, + "grad_norm": 3.6528384685516357, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 3.356069888, + "gpu_mem": 1.352761344, + "loss": 0.6162, + "grad_norm": 3.266535520553589, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 3.356266496, + "gpu_mem": 1.352796672, + "loss": 0.6388, + "grad_norm": 4.385846138000488, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 3.356463104, + "gpu_mem": 1.352753664, + "loss": 0.6798, + "grad_norm": 4.57560396194458, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 3.356659712, + "gpu_mem": 1.352781312, + "loss": 0.5279, + "grad_norm": 4.592564582824707, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 3.356659712, + "gpu_mem": 1.352790528, + "loss": 0.5272, + "grad_norm": 4.677234649658203, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 3.35685632, + "gpu_mem": 1.35276288, + "loss": 0.5515, + "grad_norm": 3.607342004776001, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 3.357052928, + "gpu_mem": 1.352772096, + "loss": 0.695, + "grad_norm": 4.254818916320801, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 3.357249536, + "gpu_mem": 1.352773632, + "loss": 0.8328, + "grad_norm": 5.417482376098633, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 3.357446144, + "gpu_mem": 1.352773632, + "loss": 0.688, + "grad_norm": 4.251400470733643, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 3.35783936, + "gpu_mem": 1.352758272, + "loss": 0.6903, + "grad_norm": 4.341720104217529, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 3.358035968, + "gpu_mem": 1.352779776, + "loss": 0.44, + "grad_norm": 3.036832809448242, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 3.358232576, + "gpu_mem": 1.352813568, + "loss": 0.6432, + "grad_norm": 4.279109954833984, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 3.358232576, + "gpu_mem": 1.352767488, + "loss": 0.6921, + "grad_norm": 4.209002494812012, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 3.358429184, + "gpu_mem": 1.352773632, + "loss": 0.9065, + "grad_norm": 5.55528450012207, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 3.358625792, + "gpu_mem": 1.352788992, + "loss": 0.8392, + "grad_norm": 6.0490617752075195, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 3.3588224, + "gpu_mem": 1.352807424, + "loss": 0.5724, + "grad_norm": 4.034276962280273, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 3.3588224, + "gpu_mem": 1.352776704, + "loss": 0.6796, + "grad_norm": 3.805037498474121, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 3.359019008, + "gpu_mem": 1.35276288, + "loss": 0.433, + "grad_norm": 2.9470736980438232, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 3.359215616, + "gpu_mem": 1.3527552, + "loss": 0.5033, + "grad_norm": 4.362401485443115, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 3.359412224, + "gpu_mem": 1.352819712, + "loss": 0.6267, + "grad_norm": 4.4238409996032715, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 3.359608832, + "gpu_mem": 1.352758272, + "loss": 0.5334, + "grad_norm": 4.408234596252441, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 3.35980544, + "gpu_mem": 1.352810496, + "loss": 0.5753, + "grad_norm": 4.130325794219971, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 3.360002048, + "gpu_mem": 1.352792064, + "loss": 0.5316, + "grad_norm": 3.742638111114502, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 3.360198656, + "gpu_mem": 1.352790528, + "loss": 0.723, + "grad_norm": 4.273195266723633, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 3.360591872, + "gpu_mem": 1.352795136, + "loss": 0.8053, + "grad_norm": 3.785038709640503, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 3.36078848, + "gpu_mem": 1.35277056, + "loss": 0.5369, + "grad_norm": 3.9917356967926025, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 3.360985088, + "gpu_mem": 1.352799744, + "loss": 0.76, + "grad_norm": 4.602860450744629, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 3.360985088, + "gpu_mem": 1.352776704, + "loss": 0.6454, + "grad_norm": 4.467726707458496, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 3.361181696, + "gpu_mem": 1.352838144, + "loss": 0.6896, + "grad_norm": 3.903024673461914, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 3.361181696, + "gpu_mem": 1.35276288, + "loss": 0.5673, + "grad_norm": 3.6980621814727783, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 3.361378304, + "gpu_mem": 1.352773632, + "loss": 0.5985, + "grad_norm": 3.917418956756592, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 3.361574912, + "gpu_mem": 1.352772096, + "loss": 0.7482, + "grad_norm": 3.632282257080078, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 3.36177152, + "gpu_mem": 1.352769024, + "loss": 0.6959, + "grad_norm": 3.346452474594116, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 3.361968128, + "gpu_mem": 1.352799744, + "loss": 0.5244, + "grad_norm": 3.4886558055877686, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 3.362164736, + "gpu_mem": 1.35277824, + "loss": 0.6486, + "grad_norm": 3.6581006050109863, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 3.362361344, + "gpu_mem": 1.352773632, + "loss": 0.6321, + "grad_norm": 5.458331108093262, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 3.362557952, + "gpu_mem": 1.352784384, + "loss": 0.6029, + "grad_norm": 3.818387746810913, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 3.36275456, + "gpu_mem": 1.352788992, + "loss": 0.5654, + "grad_norm": 4.140982151031494, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 3.362951168, + "gpu_mem": 1.352750592, + "loss": 0.6352, + "grad_norm": 4.044063091278076, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 3.362951168, + "gpu_mem": 1.352818176, + "loss": 0.9308, + "grad_norm": 5.916615962982178, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 3.362951168, + "gpu_mem": 1.352781312, + "loss": 0.6995, + "grad_norm": 3.5534729957580566, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 3.363147776, + "gpu_mem": 1.35277056, + "loss": 0.6003, + "grad_norm": 3.8903427124023438, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 3.363344384, + "gpu_mem": 1.352787456, + "loss": 0.5697, + "grad_norm": 3.0323007106781006, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 3.363540992, + "gpu_mem": 1.352761344, + "loss": 0.5726, + "grad_norm": 5.007686614990234, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 3.3637376, + "gpu_mem": 1.35280896, + "loss": 0.6706, + "grad_norm": 4.239434719085693, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 3.363934208, + "gpu_mem": 1.352776704, + "loss": 0.6459, + "grad_norm": 3.958889961242676, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 3.363934208, + "gpu_mem": 1.352765952, + "loss": 0.5181, + "grad_norm": 3.5305209159851074, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 3.364130816, + "gpu_mem": 1.352781312, + "loss": 0.6849, + "grad_norm": 6.051270961761475, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 3.364327424, + "gpu_mem": 1.35277824, + "loss": 0.6186, + "grad_norm": 5.957346439361572, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 3.364524032, + "gpu_mem": 1.35277824, + "loss": 0.587, + "grad_norm": 4.974151134490967, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 3.364524032, + "gpu_mem": 1.352765952, + "loss": 0.5941, + "grad_norm": 4.99975061416626, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 3.36472064, + "gpu_mem": 1.352749056, + "loss": 0.8854, + "grad_norm": 6.647341728210449, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 3.364917248, + "gpu_mem": 1.352812032, + "loss": 0.6545, + "grad_norm": 4.363974571228027, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 3.364917248, + "gpu_mem": 1.352765952, + "loss": 0.5835, + "grad_norm": 4.4976701736450195, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 3.365113856, + "gpu_mem": 1.352775168, + "loss": 0.5722, + "grad_norm": 3.332082986831665, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 3.365310464, + "gpu_mem": 1.352810496, + "loss": 0.4127, + "grad_norm": 3.295799970626831, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 3.365507072, + "gpu_mem": 1.352775168, + "loss": 0.5963, + "grad_norm": 4.3185715675354, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 3.365507072, + "gpu_mem": 1.352779776, + "loss": 0.6884, + "grad_norm": 4.085981845855713, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 3.36570368, + "gpu_mem": 1.352827392, + "loss": 0.4007, + "grad_norm": 3.3694357872009277, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 3.365900288, + "gpu_mem": 1.352836608, + "loss": 0.4089, + "grad_norm": 3.297218084335327, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 3.366096896, + "gpu_mem": 1.352790528, + "loss": 0.4873, + "grad_norm": 4.216040134429932, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 3.366096896, + "gpu_mem": 1.352784384, + "loss": 0.673, + "grad_norm": 4.702591896057129, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 3.366096896, + "gpu_mem": 1.352845824, + "loss": 0.5321, + "grad_norm": 4.1223297119140625, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 3.366293504, + "gpu_mem": 1.352772096, + "loss": 0.5063, + "grad_norm": 5.441708564758301, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 3.366490112, + "gpu_mem": 1.35277056, + "loss": 0.4912, + "grad_norm": 4.953702449798584, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 3.36668672, + "gpu_mem": 1.352773632, + "loss": 0.4429, + "grad_norm": 4.165315628051758, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 3.36668672, + "gpu_mem": 1.352759808, + "loss": 0.5366, + "grad_norm": 5.306684970855713, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 3.366883328, + "gpu_mem": 1.352775168, + "loss": 0.3862, + "grad_norm": 5.7248125076293945, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 3.367079936, + "gpu_mem": 1.352813568, + "loss": 0.5923, + "grad_norm": 5.04981803894043, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 3.367079936, + "gpu_mem": 1.3527936, + "loss": 0.7655, + "grad_norm": 5.304744243621826, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 3.367276544, + "gpu_mem": 1.352819712, + "loss": 0.6967, + "grad_norm": 4.695727348327637, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 3.367473152, + "gpu_mem": 1.35277056, + "loss": 0.6353, + "grad_norm": 3.9860808849334717, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 3.36766976, + "gpu_mem": 1.352764416, + "loss": 0.6546, + "grad_norm": 3.926760196685791, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 3.36766976, + "gpu_mem": 1.352787456, + "loss": 0.5104, + "grad_norm": 2.7474117279052734, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 3.367866368, + "gpu_mem": 1.352765952, + "loss": 0.6319, + "grad_norm": 3.122296094894409, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 3.368062976, + "gpu_mem": 1.352779776, + "loss": 0.688, + "grad_norm": 3.979159116744995, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 3.368062976, + "gpu_mem": 1.352784384, + "loss": 0.5601, + "grad_norm": 3.670821189880371, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 3.368259584, + "gpu_mem": 1.352802816, + "loss": 0.5168, + "grad_norm": 3.53165864944458, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 3.368456192, + "gpu_mem": 1.352773632, + "loss": 0.396, + "grad_norm": 2.9155280590057373, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 3.368456192, + "gpu_mem": 1.35280128, + "loss": 0.7394, + "grad_norm": 4.186156272888184, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 3.3686528, + "gpu_mem": 1.352782848, + "loss": 0.5321, + "grad_norm": 3.89064621925354, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 3.3686528, + "gpu_mem": 1.35277056, + "loss": 0.5177, + "grad_norm": 3.5968830585479736, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 3.3686528, + "gpu_mem": 1.352779776, + "loss": 0.8071, + "grad_norm": 5.362423419952393, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 3.3686528, + "gpu_mem": 1.352776704, + "loss": 0.6185, + "grad_norm": 4.483001708984375, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 3.368849408, + "gpu_mem": 1.352792064, + "loss": 0.4301, + "grad_norm": 3.575410842895508, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 3.369046016, + "gpu_mem": 1.352799744, + "loss": 0.6066, + "grad_norm": 4.356827259063721, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 3.369242624, + "gpu_mem": 1.352788992, + "loss": 0.5641, + "grad_norm": 4.000094413757324, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 3.369242624, + "gpu_mem": 1.352773632, + "loss": 0.7959, + "grad_norm": 4.407611846923828, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 3.369439232, + "gpu_mem": 1.352776704, + "loss": 0.5858, + "grad_norm": 3.463501214981079, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 3.36963584, + "gpu_mem": 1.35277056, + "loss": 0.5731, + "grad_norm": 3.739112615585327, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 3.36963584, + "gpu_mem": 1.352765952, + "loss": 0.7162, + "grad_norm": 4.348962306976318, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 3.36963584, + "gpu_mem": 1.352787456, + "loss": 0.8298, + "grad_norm": 3.7981348037719727, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 3.369832448, + "gpu_mem": 1.352779776, + "loss": 0.7864, + "grad_norm": 3.926572561264038, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 3.369832448, + "gpu_mem": 1.352752128, + "loss": 0.6977, + "grad_norm": 4.096456527709961, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 3.370029056, + "gpu_mem": 1.352750592, + "loss": 0.6008, + "grad_norm": 2.906160354614258, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 3.370029056, + "gpu_mem": 1.352776704, + "loss": 0.7415, + "grad_norm": 3.268622398376465, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 3.370225664, + "gpu_mem": 1.352759808, + "loss": 0.5005, + "grad_norm": 2.9466443061828613, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 3.370225664, + "gpu_mem": 1.352790528, + "loss": 0.4583, + "grad_norm": 2.4142391681671143, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 3.370422272, + "gpu_mem": 1.352773632, + "loss": 0.5987, + "grad_norm": 3.795623779296875, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 3.37061888, + "gpu_mem": 1.352804352, + "loss": 0.5715, + "grad_norm": 3.984400510787964, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 3.37061888, + "gpu_mem": 1.352772096, + "loss": 0.6653, + "grad_norm": 5.927000999450684, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 3.37061888, + "gpu_mem": 1.352798208, + "loss": 0.4119, + "grad_norm": 3.3445992469787598, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 3.370815488, + "gpu_mem": 1.352773632, + "loss": 0.4255, + "grad_norm": 3.9626619815826416, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 3.370815488, + "gpu_mem": 1.352769024, + "loss": 0.7068, + "grad_norm": 5.114670753479004, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 3.371012096, + "gpu_mem": 1.352772096, + "loss": 0.6123, + "grad_norm": 6.201516151428223, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 3.371012096, + "gpu_mem": 1.352790528, + "loss": 0.4429, + "grad_norm": 4.32002592086792, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 3.371012096, + "gpu_mem": 1.35277056, + "loss": 0.6012, + "grad_norm": 4.6900224685668945, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 3.371208704, + "gpu_mem": 1.352775168, + "loss": 0.6178, + "grad_norm": 4.292315483093262, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 3.371405312, + "gpu_mem": 1.35277056, + "loss": 0.6111, + "grad_norm": 4.019443035125732, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 3.371405312, + "gpu_mem": 1.35277824, + "loss": 0.3838, + "grad_norm": 3.6528115272521973, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 3.37160192, + "gpu_mem": 1.352802816, + "loss": 0.6551, + "grad_norm": 3.811068296432495, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 3.37160192, + "gpu_mem": 1.352795136, + "loss": 0.5555, + "grad_norm": 3.3630189895629883, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 3.371798528, + "gpu_mem": 1.352796672, + "loss": 0.5821, + "grad_norm": 2.9582321643829346, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 3.371798528, + "gpu_mem": 1.352772096, + "loss": 0.5466, + "grad_norm": 3.0238680839538574, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 3.371995136, + "gpu_mem": 1.352773632, + "loss": 0.661, + "grad_norm": 3.565948724746704, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 3.371995136, + "gpu_mem": 1.3527936, + "loss": 0.6062, + "grad_norm": 3.9685187339782715, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 3.371995136, + "gpu_mem": 1.352765952, + "loss": 0.7138, + "grad_norm": 3.292141914367676, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 3.372191744, + "gpu_mem": 1.35277824, + "loss": 0.4363, + "grad_norm": 3.6430821418762207, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 3.372388352, + "gpu_mem": 1.352787456, + "loss": 0.4193, + "grad_norm": 3.865039587020874, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 3.372388352, + "gpu_mem": 1.352764416, + "loss": 0.5572, + "grad_norm": 4.9217095375061035, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 3.37258496, + "gpu_mem": 1.352788992, + "loss": 0.6992, + "grad_norm": 6.7620625495910645, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 3.37258496, + "gpu_mem": 1.352788992, + "loss": 0.5903, + "grad_norm": 4.646149158477783, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 3.372781568, + "gpu_mem": 1.352772096, + "loss": 0.4419, + "grad_norm": 5.116950988769531, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 3.372781568, + "gpu_mem": 1.352769024, + "loss": 0.4842, + "grad_norm": 3.881664514541626, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 3.372781568, + "gpu_mem": 1.352761344, + "loss": 0.6204, + "grad_norm": 5.5193376541137695, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 3.372781568, + "gpu_mem": 1.352799744, + "loss": 0.5987, + "grad_norm": 5.519906520843506, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 3.372781568, + "gpu_mem": 1.352776704, + "loss": 0.9151, + "grad_norm": 5.508474826812744, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 3.372978176, + "gpu_mem": 1.352775168, + "loss": 0.6687, + "grad_norm": 4.599000453948975, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 3.373174784, + "gpu_mem": 1.352792064, + "loss": 0.5676, + "grad_norm": 3.865410804748535, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 3.373174784, + "gpu_mem": 1.352776704, + "loss": 0.4992, + "grad_norm": 3.1082351207733154, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 3.373174784, + "gpu_mem": 1.352788992, + "loss": 0.6142, + "grad_norm": 3.3066911697387695, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 3.373371392, + "gpu_mem": 1.35280128, + "loss": 0.4867, + "grad_norm": 2.791574716567993, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 3.373371392, + "gpu_mem": 1.352776704, + "loss": 0.5614, + "grad_norm": 3.034593343734741, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 3.373371392, + "gpu_mem": 1.352821248, + "loss": 0.587, + "grad_norm": 3.1059231758117676, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 3.373568, + "gpu_mem": 1.352795136, + "loss": 0.5759, + "grad_norm": 3.3631482124328613, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 3.373764608, + "gpu_mem": 1.352792064, + "loss": 0.5557, + "grad_norm": 3.158411979675293, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 3.373764608, + "gpu_mem": 1.352773632, + "loss": 0.3783, + "grad_norm": 2.969353437423706, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 3.373764608, + "gpu_mem": 1.352779776, + "loss": 0.424, + "grad_norm": 3.653618335723877, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 3.373961216, + "gpu_mem": 1.352749056, + "loss": 0.5082, + "grad_norm": 4.198459148406982, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 3.373961216, + "gpu_mem": 1.352813568, + "loss": 0.827, + "grad_norm": 4.518367767333984, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 3.373961216, + "gpu_mem": 1.352767488, + "loss": 0.5685, + "grad_norm": 3.828432321548462, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 3.373961216, + "gpu_mem": 1.352761344, + "loss": 0.4863, + "grad_norm": 3.5348360538482666, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 3.374157824, + "gpu_mem": 1.35281664, + "loss": 0.3199, + "grad_norm": 2.988959550857544, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 3.374354432, + "gpu_mem": 1.352782848, + "loss": 0.4593, + "grad_norm": 4.200687408447266, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 3.374354432, + "gpu_mem": 1.35277056, + "loss": 0.4773, + "grad_norm": 4.666147232055664, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 3.374354432, + "gpu_mem": 1.352775168, + "loss": 0.422, + "grad_norm": 3.6602771282196045, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 3.374354432, + "gpu_mem": 1.3527552, + "loss": 0.5954, + "grad_norm": 5.16617488861084, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 3.374354432, + "gpu_mem": 1.352779776, + "loss": 0.4376, + "grad_norm": 3.597954034805298, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 3.37455104, + "gpu_mem": 1.352758272, + "loss": 0.3423, + "grad_norm": 4.162787437438965, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 3.37455104, + "gpu_mem": 1.352775168, + "loss": 0.4769, + "grad_norm": 4.876043319702148, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 3.37455104, + "gpu_mem": 1.35273984, + "loss": 0.5801, + "grad_norm": 5.590660095214844, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 3.374747648, + "gpu_mem": 1.352772096, + "loss": 0.3687, + "grad_norm": 4.309680461883545, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 3.374944256, + "gpu_mem": 1.352761344, + "loss": 0.305, + "grad_norm": 3.170595169067383, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 3.374944256, + "gpu_mem": 1.352798208, + "loss": 0.6644, + "grad_norm": 5.851312160491943, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 3.374944256, + "gpu_mem": 1.352764416, + "loss": 0.4181, + "grad_norm": 4.313964366912842, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 3.374944256, + "gpu_mem": 1.352787456, + "loss": 0.4462, + "grad_norm": 4.067446708679199, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 3.375140864, + "gpu_mem": 1.352776704, + "loss": 0.5936, + "grad_norm": 4.847165584564209, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 3.375337472, + "gpu_mem": 1.352782848, + "loss": 0.4842, + "grad_norm": 4.22496223449707, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 3.375337472, + "gpu_mem": 1.352776704, + "loss": 0.5527, + "grad_norm": 3.6803245544433594, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 3.375337472, + "gpu_mem": 1.352795136, + "loss": 0.6514, + "grad_norm": 3.922409772872925, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 3.375337472, + "gpu_mem": 1.3527552, + "loss": 0.4712, + "grad_norm": 3.146977424621582, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 3.375337472, + "gpu_mem": 1.352787456, + "loss": 0.7957, + "grad_norm": 4.445869445800781, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 3.37553408, + "gpu_mem": 1.352807424, + "loss": 0.5521, + "grad_norm": 3.059021234512329, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 3.37553408, + "gpu_mem": 1.35280128, + "loss": 0.4925, + "grad_norm": 3.274728536605835, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 3.37553408, + "gpu_mem": 1.352764416, + "loss": 0.5477, + "grad_norm": 3.361356496810913, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 3.375730688, + "gpu_mem": 1.352781312, + "loss": 0.3974, + "grad_norm": 2.560831069946289, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 3.375730688, + "gpu_mem": 1.352758272, + "loss": 0.6577, + "grad_norm": 3.0244691371917725, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 3.375730688, + "gpu_mem": 1.352790528, + "loss": 0.5375, + "grad_norm": 3.9437148571014404, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 3.375730688, + "gpu_mem": 1.35278592, + "loss": 0.4626, + "grad_norm": 4.583610534667969, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 3.375927296, + "gpu_mem": 1.352795136, + "loss": 0.3966, + "grad_norm": 3.6022286415100098, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 3.375927296, + "gpu_mem": 1.352769024, + "loss": 0.4562, + "grad_norm": 3.879387378692627, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 3.376123904, + "gpu_mem": 1.352788992, + "loss": 0.5144, + "grad_norm": 3.477837085723877, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 3.376123904, + "gpu_mem": 1.35276288, + "loss": 0.4667, + "grad_norm": 3.6576311588287354, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 3.376123904, + "gpu_mem": 1.352787456, + "loss": 0.5839, + "grad_norm": 4.575575828552246, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 3.376123904, + "gpu_mem": 1.352772096, + "loss": 0.3424, + "grad_norm": 3.4057581424713135, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 3.376320512, + "gpu_mem": 1.352805888, + "loss": 0.5419, + "grad_norm": 4.603096961975098, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 3.376320512, + "gpu_mem": 1.35278592, + "loss": 0.3966, + "grad_norm": 3.5948843955993652, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 3.376320512, + "gpu_mem": 1.35277056, + "loss": 0.4809, + "grad_norm": 4.278548717498779, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 3.376320512, + "gpu_mem": 1.352805888, + "loss": 0.388, + "grad_norm": 3.663811683654785, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 3.376320512, + "gpu_mem": 1.352812032, + "loss": 0.3633, + "grad_norm": 3.677976369857788, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 3.376320512, + "gpu_mem": 1.352775168, + "loss": 0.4472, + "grad_norm": 4.887213230133057, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 3.376320512, + "gpu_mem": 1.352753664, + "loss": 0.4405, + "grad_norm": 4.323986530303955, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 3.37651712, + "gpu_mem": 1.352805888, + "loss": 0.6411, + "grad_norm": 4.974346160888672, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 3.37651712, + "gpu_mem": 1.352792064, + "loss": 0.4295, + "grad_norm": 4.051982402801514, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 3.37651712, + "gpu_mem": 1.35278592, + "loss": 0.604, + "grad_norm": 5.188896656036377, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 3.37651712, + "gpu_mem": 1.352792064, + "loss": 0.5237, + "grad_norm": 3.2285068035125732, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 3.376713728, + "gpu_mem": 1.352769024, + "loss": 0.6716, + "grad_norm": 3.8758554458618164, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 3.376713728, + "gpu_mem": 1.352782848, + "loss": 0.2892, + "grad_norm": 3.24933123588562, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 3.376713728, + "gpu_mem": 1.352782848, + "loss": 0.5856, + "grad_norm": 4.307365417480469, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 3.376713728, + "gpu_mem": 1.352752128, + "loss": 0.5097, + "grad_norm": 4.113290786743164, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 3.376910336, + "gpu_mem": 1.35278592, + "loss": 0.4865, + "grad_norm": 3.8677189350128174, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 3.376910336, + "gpu_mem": 1.352764416, + "loss": 0.4762, + "grad_norm": 3.340097188949585, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 3.377106944, + "gpu_mem": 1.352772096, + "loss": 0.4415, + "grad_norm": 3.5555615425109863, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 3.377106944, + "gpu_mem": 1.352790528, + "loss": 0.556, + "grad_norm": 3.963819742202759, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 3.377106944, + "gpu_mem": 1.352758272, + "loss": 0.655, + "grad_norm": 4.660390377044678, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 3.377106944, + "gpu_mem": 1.35276288, + "loss": 0.429, + "grad_norm": 2.758380174636841, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 3.377106944, + "gpu_mem": 1.352758272, + "loss": 0.433, + "grad_norm": 2.957456350326538, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 3.377303552, + "gpu_mem": 1.352802816, + "loss": 0.5757, + "grad_norm": 3.318423271179199, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 3.377303552, + "gpu_mem": 1.35278592, + "loss": 0.4857, + "grad_norm": 3.259636640548706, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 3.377303552, + "gpu_mem": 1.352775168, + "loss": 0.4237, + "grad_norm": 2.6246678829193115, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 3.377303552, + "gpu_mem": 1.352796672, + "loss": 0.4245, + "grad_norm": 3.4555017948150635, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 3.377303552, + "gpu_mem": 1.35276288, + "loss": 0.3959, + "grad_norm": 2.6385881900787354, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 3.377303552, + "gpu_mem": 1.35277824, + "loss": 0.3526, + "grad_norm": 2.285193920135498, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 3.377303552, + "gpu_mem": 1.35277824, + "loss": 0.5377, + "grad_norm": 3.7049150466918945, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 3.37750016, + "gpu_mem": 1.352769024, + "loss": 0.4016, + "grad_norm": 3.8973562717437744, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 3.37750016, + "gpu_mem": 1.352779776, + "loss": 0.7437, + "grad_norm": 4.763423919677734, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 3.37750016, + "gpu_mem": 1.352804352, + "loss": 0.4665, + "grad_norm": 3.732358455657959, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 3.37750016, + "gpu_mem": 1.352756736, + "loss": 0.5149, + "grad_norm": 3.4114763736724854, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 3.377696768, + "gpu_mem": 1.352792064, + "loss": 0.5312, + "grad_norm": 4.810686111450195, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 3.377696768, + "gpu_mem": 1.352753664, + "loss": 0.4394, + "grad_norm": 3.9476475715637207, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 3.377696768, + "gpu_mem": 1.352772096, + "loss": 0.508, + "grad_norm": 4.353238105773926, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 3.377893376, + "gpu_mem": 1.352764416, + "loss": 0.4599, + "grad_norm": 3.44915771484375, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 3.377893376, + "gpu_mem": 1.35280128, + "loss": 0.4546, + "grad_norm": 3.3591649532318115, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 3.377893376, + "gpu_mem": 1.352761344, + "loss": 0.3849, + "grad_norm": 3.3660662174224854, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 3.377893376, + "gpu_mem": 1.352775168, + "loss": 0.3477, + "grad_norm": 2.8061580657958984, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 3.377893376, + "gpu_mem": 1.352779776, + "loss": 0.3291, + "grad_norm": 3.1202316284179688, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 3.377893376, + "gpu_mem": 1.352741376, + "loss": 0.5207, + "grad_norm": 3.9994020462036133, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 3.377893376, + "gpu_mem": 1.352764416, + "loss": 0.5696, + "grad_norm": 4.1667704582214355, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 3.378089984, + "gpu_mem": 1.35276288, + "loss": 0.4065, + "grad_norm": 3.470923662185669, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 3.378286592, + "gpu_mem": 1.352781312, + "loss": 0.5779, + "grad_norm": 4.573644161224365, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 3.378286592, + "gpu_mem": 1.35277824, + "loss": 0.6074, + "grad_norm": 4.387336730957031, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 3.378286592, + "gpu_mem": 1.352776704, + "loss": 0.4229, + "grad_norm": 3.732419013977051, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 3.378286592, + "gpu_mem": 1.352795136, + "loss": 0.3205, + "grad_norm": 3.3473639488220215, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 3.378286592, + "gpu_mem": 1.352756736, + "loss": 0.5907, + "grad_norm": 4.7200236320495605, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 3.3784832, + "gpu_mem": 1.35280128, + "loss": 0.3881, + "grad_norm": 4.634664058685303, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 3.3784832, + "gpu_mem": 1.352765952, + "loss": 0.4399, + "grad_norm": 3.3430416584014893, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 3.3784832, + "gpu_mem": 1.3527936, + "loss": 0.4631, + "grad_norm": 3.4971299171447754, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 3.3784832, + "gpu_mem": 1.352773632, + "loss": 0.425, + "grad_norm": 4.054968357086182, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 3.3784832, + "gpu_mem": 1.352819712, + "loss": 0.6141, + "grad_norm": 4.100009918212891, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 3.3784832, + "gpu_mem": 1.352784384, + "loss": 0.4355, + "grad_norm": 3.421898126602173, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352775168, + "loss": 0.3742, + "grad_norm": 3.2273480892181396, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352769024, + "loss": 0.2697, + "grad_norm": 2.7979235649108887, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352753664, + "loss": 0.5351, + "grad_norm": 3.3184492588043213, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352772096, + "loss": 0.6704, + "grad_norm": 5.0973734855651855, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352773632, + "loss": 0.2708, + "grad_norm": 2.9184751510620117, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 3.378679808, + "gpu_mem": 1.35277824, + "loss": 0.4136, + "grad_norm": 3.7115933895111084, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352781312, + "loss": 0.3754, + "grad_norm": 3.89996337890625, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352775168, + "loss": 0.4261, + "grad_norm": 3.6031978130340576, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 3.378679808, + "gpu_mem": 1.35280128, + "loss": 0.6204, + "grad_norm": 5.002869129180908, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352769024, + "loss": 0.4377, + "grad_norm": 3.9823503494262695, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352796672, + "loss": 0.424, + "grad_norm": 4.157087802886963, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 3.378679808, + "gpu_mem": 1.352804352, + "loss": 0.3831, + "grad_norm": 4.2339887619018555, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 3.378679808, + "gpu_mem": 1.35278592, + "loss": 0.5976, + "grad_norm": 5.517625331878662, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 3.378876416, + "gpu_mem": 1.352772096, + "loss": 0.3983, + "grad_norm": 4.364997386932373, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352782848, + "loss": 0.4327, + "grad_norm": 3.96940016746521, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352775168, + "loss": 0.4073, + "grad_norm": 4.3990044593811035, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352792064, + "loss": 0.5437, + "grad_norm": 3.916335344314575, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352764416, + "loss": 0.4255, + "grad_norm": 4.550477504730225, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352795136, + "loss": 0.4098, + "grad_norm": 3.532620906829834, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352776704, + "loss": 0.508, + "grad_norm": 4.6265645027160645, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352764416, + "loss": 0.4835, + "grad_norm": 3.679452657699585, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352776704, + "loss": 0.4254, + "grad_norm": 3.8895883560180664, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352782848, + "loss": 0.3169, + "grad_norm": 4.098667621612549, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 3.379073024, + "gpu_mem": 1.35277056, + "loss": 0.3289, + "grad_norm": 3.155287981033325, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352759808, + "loss": 0.2681, + "grad_norm": 3.1111369132995605, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352761344, + "loss": 0.3194, + "grad_norm": 3.7213423252105713, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 3.379073024, + "gpu_mem": 1.352775168, + "loss": 0.3961, + "grad_norm": 2.7526121139526367, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 3.379073024, + "gpu_mem": 1.35277824, + "loss": 0.5238, + "grad_norm": 4.715906620025635, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 3.379269632, + "gpu_mem": 1.352788992, + "loss": 0.4662, + "grad_norm": 4.900501728057861, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 3.379269632, + "gpu_mem": 1.35276288, + "loss": 0.5316, + "grad_norm": 3.9620110988616943, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 3.379269632, + "gpu_mem": 1.35277824, + "loss": 0.5476, + "grad_norm": 4.485487937927246, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 3.379269632, + "gpu_mem": 1.352787456, + "loss": 0.482, + "grad_norm": 3.9051544666290283, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 3.379269632, + "gpu_mem": 1.352761344, + "loss": 0.5489, + "grad_norm": 4.311708450317383, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 3.379269632, + "gpu_mem": 1.352767488, + "loss": 0.3971, + "grad_norm": 2.925165891647339, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 3.379269632, + "gpu_mem": 1.352756736, + "loss": 0.4881, + "grad_norm": 3.2379543781280518, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 3.379269632, + "gpu_mem": 1.35276288, + "loss": 0.3382, + "grad_norm": 3.762775421142578, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 3.379269632, + "gpu_mem": 1.352799744, + "loss": 0.3141, + "grad_norm": 3.6455252170562744, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 3.379269632, + "gpu_mem": 1.35274752, + "loss": 0.6392, + "grad_norm": 4.955118179321289, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 3.37946624, + "gpu_mem": 1.352767488, + "loss": 0.5215, + "grad_norm": 3.5804126262664795, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 3.37946624, + "gpu_mem": 1.352767488, + "loss": 0.3425, + "grad_norm": 3.047938108444214, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 3.37946624, + "gpu_mem": 1.352765952, + "loss": 0.394, + "grad_norm": 2.6385481357574463, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 3.37946624, + "gpu_mem": 1.352764416, + "loss": 0.3393, + "grad_norm": 3.5619046688079834, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352756736, + "loss": 0.5774, + "grad_norm": 4.705263137817383, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 3.379662848, + "gpu_mem": 1.35281664, + "loss": 0.34, + "grad_norm": 2.822368860244751, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352761344, + "loss": 0.4159, + "grad_norm": 3.386230945587158, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352744448, + "loss": 0.4803, + "grad_norm": 3.1763899326324463, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352775168, + "loss": 0.4959, + "grad_norm": 3.438669443130493, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352819712, + "loss": 0.4017, + "grad_norm": 3.981435775756836, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352799744, + "loss": 0.3246, + "grad_norm": 3.1795785427093506, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352799744, + "loss": 0.3279, + "grad_norm": 3.4790945053100586, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352765952, + "loss": 0.4574, + "grad_norm": 4.594079494476318, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352790528, + "loss": 0.4626, + "grad_norm": 3.799405813217163, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 3.379662848, + "gpu_mem": 1.3527936, + "loss": 0.5272, + "grad_norm": 4.6277265548706055, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352772096, + "loss": 0.3915, + "grad_norm": 4.537452697753906, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352795136, + "loss": 0.3884, + "grad_norm": 3.8674097061157227, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352775168, + "loss": 0.422, + "grad_norm": 5.062219619750977, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352799744, + "loss": 0.3816, + "grad_norm": 3.378476142883301, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352782848, + "loss": 0.4568, + "grad_norm": 4.605844974517822, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 3.379662848, + "gpu_mem": 1.35277824, + "loss": 0.4155, + "grad_norm": 4.786030292510986, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352790528, + "loss": 0.4029, + "grad_norm": 3.570801019668579, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352761344, + "loss": 0.3101, + "grad_norm": 2.915475845336914, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 3.379662848, + "gpu_mem": 1.352775168, + "loss": 0.5903, + "grad_norm": 4.397287368774414, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 3.379859456, + "gpu_mem": 1.352761344, + "loss": 0.4096, + "grad_norm": 5.597043037414551, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 3.379859456, + "gpu_mem": 1.3527552, + "loss": 0.4258, + "grad_norm": 4.228761196136475, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 3.379859456, + "gpu_mem": 1.352761344, + "loss": 0.2657, + "grad_norm": 2.755167245864868, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 3.379859456, + "gpu_mem": 1.352775168, + "loss": 0.3612, + "grad_norm": 3.062067985534668, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 3.379859456, + "gpu_mem": 1.352758272, + "loss": 0.4959, + "grad_norm": 4.356472015380859, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352812032, + "loss": 0.3183, + "grad_norm": 2.6905875205993652, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 3.380056064, + "gpu_mem": 1.3527552, + "loss": 0.2604, + "grad_norm": 3.2491722106933594, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352833536, + "loss": 0.3526, + "grad_norm": 3.7245681285858154, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352776704, + "loss": 0.3204, + "grad_norm": 3.301570415496826, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352795136, + "loss": 0.4564, + "grad_norm": 4.058881759643555, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 3.380056064, + "gpu_mem": 1.35277056, + "loss": 0.3939, + "grad_norm": 3.83038330078125, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352802816, + "loss": 0.4555, + "grad_norm": 4.3954691886901855, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352822784, + "loss": 0.4738, + "grad_norm": 3.8564441204071045, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352752128, + "loss": 0.3554, + "grad_norm": 4.3323564529418945, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352765952, + "loss": 0.2848, + "grad_norm": 3.4251291751861572, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352750592, + "loss": 0.3172, + "grad_norm": 3.467022657394409, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352788992, + "loss": 0.2306, + "grad_norm": 2.661904811859131, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352788992, + "loss": 0.4957, + "grad_norm": 4.646728038787842, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352775168, + "loss": 0.3856, + "grad_norm": 3.935546636581421, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352765952, + "loss": 0.3887, + "grad_norm": 3.8935399055480957, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 3.380056064, + "gpu_mem": 1.35277056, + "loss": 0.5382, + "grad_norm": 5.200833797454834, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352773632, + "loss": 0.3455, + "grad_norm": 3.853806495666504, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352779776, + "loss": 0.3531, + "grad_norm": 3.2284677028656006, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352798208, + "loss": 0.353, + "grad_norm": 4.369974613189697, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352792064, + "loss": 0.5056, + "grad_norm": 5.107171058654785, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352769024, + "loss": 0.3011, + "grad_norm": 3.6832852363586426, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 3.380056064, + "gpu_mem": 1.352756736, + "loss": 0.4871, + "grad_norm": 4.155484676361084, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352722944, + "loss": 0.4324, + "grad_norm": 3.888580799102783, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 3.380252672, + "gpu_mem": 1.35277056, + "loss": 0.4041, + "grad_norm": 3.9685370922088623, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352736768, + "loss": 0.3837, + "grad_norm": 3.4673726558685303, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352784384, + "loss": 0.4141, + "grad_norm": 3.85807728767395, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352782848, + "loss": 0.553, + "grad_norm": 3.754528045654297, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352784384, + "loss": 0.2679, + "grad_norm": 2.671337366104126, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 3.380252672, + "gpu_mem": 1.3527936, + "loss": 0.6159, + "grad_norm": 5.270925998687744, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352769024, + "loss": 0.4717, + "grad_norm": 4.758904933929443, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352753664, + "loss": 0.4035, + "grad_norm": 4.24276065826416, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352782848, + "loss": 0.37, + "grad_norm": 3.7135841846466064, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352796672, + "loss": 0.3868, + "grad_norm": 4.361562728881836, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352752128, + "loss": 0.3845, + "grad_norm": 3.7186875343322754, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352758272, + "loss": 0.474, + "grad_norm": 3.8212132453918457, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352787456, + "loss": 0.522, + "grad_norm": 4.496225833892822, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352782848, + "loss": 0.3769, + "grad_norm": 3.5001988410949707, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352769024, + "loss": 0.583, + "grad_norm": 4.820213794708252, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352782848, + "loss": 0.3817, + "grad_norm": 3.87214732170105, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352772096, + "loss": 0.4643, + "grad_norm": 4.26419734954834, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 3.380252672, + "gpu_mem": 1.35277824, + "loss": 0.2265, + "grad_norm": 2.344705104827881, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352782848, + "loss": 0.5928, + "grad_norm": 5.453653335571289, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 3.380252672, + "gpu_mem": 1.35277824, + "loss": 0.4196, + "grad_norm": 3.479572296142578, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 3.380252672, + "gpu_mem": 1.352752128, + "loss": 0.3469, + "grad_norm": 2.8647336959838867, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352761344, + "loss": 0.3913, + "grad_norm": 3.5005202293395996, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352779776, + "loss": 0.479, + "grad_norm": 4.1472086906433105, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352750592, + "loss": 0.4194, + "grad_norm": 3.5357871055603027, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352781312, + "loss": 0.4791, + "grad_norm": 4.100743293762207, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352790528, + "loss": 0.3355, + "grad_norm": 3.2997989654541016, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352752128, + "loss": 0.6448, + "grad_norm": 4.070967197418213, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352756736, + "loss": 0.343, + "grad_norm": 3.5486130714416504, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 3.38044928, + "gpu_mem": 1.352781312, + "loss": 0.4169, + "grad_norm": 3.4542014598846436, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352799744, + "loss": 0.3111, + "grad_norm": 3.0073719024658203, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352781312, + "loss": 0.6458, + "grad_norm": 4.598834991455078, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352832, + "loss": 0.4273, + "grad_norm": 3.9403979778289795, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352764416, + "loss": 0.8904, + "grad_norm": 5.2412261962890625, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352765952, + "loss": 0.4098, + "grad_norm": 4.054440021514893, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352765952, + "loss": 0.4775, + "grad_norm": 4.029513835906982, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352772096, + "loss": 0.2677, + "grad_norm": 2.581970691680908, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35278592, + "loss": 0.3693, + "grad_norm": 3.5135717391967773, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352790528, + "loss": 0.4153, + "grad_norm": 3.4998838901519775, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352784384, + "loss": 0.4302, + "grad_norm": 3.5361876487731934, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35277824, + "loss": 0.3878, + "grad_norm": 3.2839441299438477, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352792064, + "loss": 0.5748, + "grad_norm": 4.429206371307373, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352784384, + "loss": 0.4952, + "grad_norm": 4.441903114318848, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352769024, + "loss": 0.4121, + "grad_norm": 3.6707990169525146, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35277824, + "loss": 0.4828, + "grad_norm": 4.229144096374512, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352787456, + "loss": 0.4566, + "grad_norm": 4.129189968109131, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352790528, + "loss": 0.4092, + "grad_norm": 3.369016647338867, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352744448, + "loss": 0.3974, + "grad_norm": 3.6037685871124268, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352798208, + "loss": 0.2161, + "grad_norm": 3.454543352127075, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352799744, + "loss": 0.394, + "grad_norm": 3.3556501865386963, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352744448, + "loss": 0.4574, + "grad_norm": 3.68985915184021, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35277824, + "loss": 0.5676, + "grad_norm": 4.463977336883545, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352756736, + "loss": 0.3872, + "grad_norm": 3.805849313735962, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352787456, + "loss": 0.489, + "grad_norm": 4.079235553741455, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35276288, + "loss": 0.3321, + "grad_norm": 3.5827927589416504, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352796672, + "loss": 0.3973, + "grad_norm": 3.4881513118743896, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35281664, + "loss": 0.5421, + "grad_norm": 4.284890174865723, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352781312, + "loss": 0.3178, + "grad_norm": 3.2541866302490234, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35280128, + "loss": 0.4384, + "grad_norm": 3.9730286598205566, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352781312, + "loss": 0.3962, + "grad_norm": 3.375638246536255, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352782848, + "loss": 0.417, + "grad_norm": 3.4858384132385254, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352775168, + "loss": 0.2946, + "grad_norm": 3.13385009765625, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352773632, + "loss": 0.4083, + "grad_norm": 2.972154378890991, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352787456, + "loss": 0.4434, + "grad_norm": 4.152482986450195, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352758272, + "loss": 0.3034, + "grad_norm": 2.5355234146118164, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35280896, + "loss": 0.3768, + "grad_norm": 3.536540985107422, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352805888, + "loss": 0.4275, + "grad_norm": 3.85062313079834, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35278592, + "loss": 0.3931, + "grad_norm": 3.7779572010040283, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352767488, + "loss": 0.3592, + "grad_norm": 3.088696241378784, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352776704, + "loss": 0.2747, + "grad_norm": 3.0343873500823975, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352744448, + "loss": 0.3819, + "grad_norm": 4.393159866333008, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352805888, + "loss": 0.5407, + "grad_norm": 4.358717441558838, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352804352, + "loss": 0.4005, + "grad_norm": 4.690081596374512, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352759808, + "loss": 0.4255, + "grad_norm": 4.11088228225708, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352792064, + "loss": 0.3736, + "grad_norm": 2.81129789352417, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35278592, + "loss": 0.4011, + "grad_norm": 4.345637798309326, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352772096, + "loss": 0.3006, + "grad_norm": 3.183746814727783, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352772096, + "loss": 0.5327, + "grad_norm": 5.218776226043701, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352798208, + "loss": 0.3586, + "grad_norm": 3.223905324935913, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35278592, + "loss": 0.3954, + "grad_norm": 4.090332984924316, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352776704, + "loss": 0.4291, + "grad_norm": 3.432371139526367, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352769024, + "loss": 0.5746, + "grad_norm": 5.256326675415039, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352792064, + "loss": 0.3002, + "grad_norm": 2.8416848182678223, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352781312, + "loss": 0.25, + "grad_norm": 2.3330235481262207, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352765952, + "loss": 0.3876, + "grad_norm": 3.0041301250457764, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352832, + "loss": 0.3004, + "grad_norm": 2.6256775856018066, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352775168, + "loss": 0.3828, + "grad_norm": 3.717013359069824, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352761344, + "loss": 0.3657, + "grad_norm": 3.1365115642547607, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352825856, + "loss": 0.3429, + "grad_norm": 3.5160603523254395, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 3.380645888, + "gpu_mem": 1.3527552, + "loss": 0.391, + "grad_norm": 3.093254327774048, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352782848, + "loss": 0.318, + "grad_norm": 3.2911975383758545, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 3.380645888, + "gpu_mem": 1.35278592, + "loss": 0.442, + "grad_norm": 3.985895872116089, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352761344, + "loss": 0.4848, + "grad_norm": 3.799037218093872, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352788992, + "loss": 0.3825, + "grad_norm": 4.400968551635742, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352798208, + "loss": 0.2737, + "grad_norm": 3.102393865585327, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352795136, + "loss": 0.3703, + "grad_norm": 3.1312265396118164, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352767488, + "loss": 0.542, + "grad_norm": 3.964945077896118, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352514048, + "loss": 0.4415, + "grad_norm": 4.533402442932129, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 3.380645888, + "gpu_mem": 1.352514048, + "train_runtime": 8243.2711, + "train_samples_per_second": 4.841, + "train_steps_per_second": 0.076, + "total_flos": 8.629042704352051e+16, + "train_loss": 0.7538369896893318 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a34e999804ff05ab393ed2117c936e4d7827f88f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0bee04fe8bacdb5509fb7ba06f501c43a8c19176 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "hellaswag", + "results": 0.8333001394144592 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..bdb70f539e40abfa3e31ce832c020b4d4d4da762 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "HELLASWAG", + "dataset_id": "Rowan/hellaswag", + "preprocess_id": "hellaswag_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 1, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-hellaswag-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-hellaswag-r8-a2", + "seed": 42, + "timestamp": "2025-09-01T06:32:46.390330" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..0ab89605124f9df6062bd0f2c5119472197b6ed8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-hellaswag-r8-a2/training_logs.json @@ -0,0 +1,5629 @@ +[ + { + "step": 1, + "epoch": 0.0016025641025641025, + "cpu_mem": 3.329703936, + "gpu_mem": 1.07522304, + "loss": 4.3397, + "grad_norm": 51.138797760009766, + "learning_rate": 4.7619047619047615e-06 + }, + { + "step": 2, + "epoch": 0.003205128205128205, + "cpu_mem": 3.330686976, + "gpu_mem": 1.125679104, + "loss": 4.4533, + "grad_norm": 67.38164520263672, + "learning_rate": 9.523809523809523e-06 + }, + { + "step": 3, + "epoch": 0.004807692307692308, + "cpu_mem": 3.331866624, + "gpu_mem": 1.125686784, + "loss": 4.1518, + "grad_norm": 143.41065979003906, + "learning_rate": 1.4285714285714284e-05 + }, + { + "step": 4, + "epoch": 0.00641025641025641, + "cpu_mem": 3.332849664, + "gpu_mem": 1.125720576, + "loss": 4.2242, + "grad_norm": 319.06964111328125, + "learning_rate": 1.9047619047619046e-05 + }, + { + "step": 5, + "epoch": 0.008012820512820512, + "cpu_mem": 3.333832704, + "gpu_mem": 1.125683712, + "loss": 3.6077, + "grad_norm": 158.74594116210938, + "learning_rate": 2.3809523809523807e-05 + }, + { + "step": 6, + "epoch": 0.009615384615384616, + "cpu_mem": 3.334815744, + "gpu_mem": 1.125729792, + "loss": 3.4668, + "grad_norm": 21.32390785217285, + "learning_rate": 2.8571428571428567e-05 + }, + { + "step": 7, + "epoch": 0.011217948717948718, + "cpu_mem": 3.335602176, + "gpu_mem": 1.125689856, + "loss": 3.1872, + "grad_norm": 19.316547393798828, + "learning_rate": 3.333333333333333e-05 + }, + { + "step": 8, + "epoch": 0.01282051282051282, + "cpu_mem": 3.336388608, + "gpu_mem": 1.125720576, + "loss": 2.864, + "grad_norm": 23.239221572875977, + "learning_rate": 3.809523809523809e-05 + }, + { + "step": 9, + "epoch": 0.014423076923076924, + "cpu_mem": 3.337371648, + "gpu_mem": 1.125720576, + "loss": 2.5366, + "grad_norm": 26.871152877807617, + "learning_rate": 4.285714285714285e-05 + }, + { + "step": 10, + "epoch": 0.016025641025641024, + "cpu_mem": 3.337961472, + "gpu_mem": 1.125663744, + "loss": 2.1898, + "grad_norm": 13.458043098449707, + "learning_rate": 4.7619047619047614e-05 + }, + { + "step": 11, + "epoch": 0.017628205128205128, + "cpu_mem": 3.338551296, + "gpu_mem": 1.125683712, + "loss": 1.8854, + "grad_norm": 10.698444366455078, + "learning_rate": 5.238095238095237e-05 + }, + { + "step": 12, + "epoch": 0.019230769230769232, + "cpu_mem": 3.339337728, + "gpu_mem": 1.12568064, + "loss": 1.8642, + "grad_norm": 13.635722160339355, + "learning_rate": 5.7142857142857135e-05 + }, + { + "step": 13, + "epoch": 0.020833333333333332, + "cpu_mem": 3.34012416, + "gpu_mem": 1.12567296, + "loss": 1.5991, + "grad_norm": 5.001216888427734, + "learning_rate": 6.190476190476189e-05 + }, + { + "step": 14, + "epoch": 0.022435897435897436, + "cpu_mem": 3.340910592, + "gpu_mem": 1.125699072, + "loss": 1.4973, + "grad_norm": 3.7517952919006348, + "learning_rate": 6.666666666666666e-05 + }, + { + "step": 15, + "epoch": 0.02403846153846154, + "cpu_mem": 3.341697024, + "gpu_mem": 1.125697536, + "loss": 1.4987, + "grad_norm": 3.534456491470337, + "learning_rate": 7.142857142857142e-05 + }, + { + "step": 16, + "epoch": 0.02564102564102564, + "cpu_mem": 3.342286848, + "gpu_mem": 1.125689856, + "loss": 1.4375, + "grad_norm": 2.233186960220337, + "learning_rate": 7.619047619047618e-05 + }, + { + "step": 17, + "epoch": 0.027243589743589744, + "cpu_mem": 3.34307328, + "gpu_mem": 1.125689856, + "loss": 1.4847, + "grad_norm": 3.156278610229492, + "learning_rate": 8.095238095238093e-05 + }, + { + "step": 18, + "epoch": 0.028846153846153848, + "cpu_mem": 3.343663104, + "gpu_mem": 1.125689856, + "loss": 1.3433, + "grad_norm": 2.331435203552246, + "learning_rate": 8.57142857142857e-05 + }, + { + "step": 19, + "epoch": 0.030448717948717948, + "cpu_mem": 3.344449536, + "gpu_mem": 1.125689856, + "loss": 1.4752, + "grad_norm": 3.317863702774048, + "learning_rate": 9.047619047619046e-05 + }, + { + "step": 20, + "epoch": 0.03205128205128205, + "cpu_mem": 3.34503936, + "gpu_mem": 1.125663744, + "loss": 1.492, + "grad_norm": 4.066086292266846, + "learning_rate": 9.523809523809523e-05 + }, + { + "step": 21, + "epoch": 0.03365384615384615, + "cpu_mem": 3.345629184, + "gpu_mem": 1.12568064, + "loss": 1.4548, + "grad_norm": 2.904893398284912, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 22, + "epoch": 0.035256410256410256, + "cpu_mem": 3.346219008, + "gpu_mem": 1.12568832, + "loss": 1.4574, + "grad_norm": 4.428384780883789, + "learning_rate": 0.00010476190476190474 + }, + { + "step": 23, + "epoch": 0.03685897435897436, + "cpu_mem": 3.346808832, + "gpu_mem": 1.125702144, + "loss": 1.4157, + "grad_norm": 2.664661169052124, + "learning_rate": 0.0001095238095238095 + }, + { + "step": 24, + "epoch": 0.038461538461538464, + "cpu_mem": 3.347595264, + "gpu_mem": 1.125686784, + "loss": 1.383, + "grad_norm": 6.989508152008057, + "learning_rate": 0.00011428571428571427 + }, + { + "step": 25, + "epoch": 0.04006410256410257, + "cpu_mem": 3.348185088, + "gpu_mem": 1.125674496, + "loss": 1.5176, + "grad_norm": 5.456232070922852, + "learning_rate": 0.00011904761904761903 + }, + { + "step": 26, + "epoch": 0.041666666666666664, + "cpu_mem": 3.348774912, + "gpu_mem": 1.12568064, + "loss": 1.431, + "grad_norm": 6.38388729095459, + "learning_rate": 0.00012380952380952378 + }, + { + "step": 27, + "epoch": 0.04326923076923077, + "cpu_mem": 3.349364736, + "gpu_mem": 1.12568832, + "loss": 1.3765, + "grad_norm": 1.4423831701278687, + "learning_rate": 0.00012857142857142855 + }, + { + "step": 28, + "epoch": 0.04487179487179487, + "cpu_mem": 3.34995456, + "gpu_mem": 1.125683712, + "loss": 1.4221, + "grad_norm": 1.9716120958328247, + "learning_rate": 0.0001333333333333333 + }, + { + "step": 29, + "epoch": 0.046474358974358976, + "cpu_mem": 3.350544384, + "gpu_mem": 1.125692928, + "loss": 1.4877, + "grad_norm": 4.363304138183594, + "learning_rate": 0.00013809523809523808 + }, + { + "step": 30, + "epoch": 0.04807692307692308, + "cpu_mem": 3.351134208, + "gpu_mem": 1.12566528, + "loss": 1.4622, + "grad_norm": 2.522653818130493, + "learning_rate": 0.00014285714285714284 + }, + { + "step": 31, + "epoch": 0.049679487179487176, + "cpu_mem": 3.351724032, + "gpu_mem": 1.125720576, + "loss": 1.3892, + "grad_norm": 1.098405122756958, + "learning_rate": 0.0001476190476190476 + }, + { + "step": 32, + "epoch": 0.05128205128205128, + "cpu_mem": 3.352313856, + "gpu_mem": 1.125712896, + "loss": 1.3817, + "grad_norm": 0.9411613941192627, + "learning_rate": 0.00015238095238095237 + }, + { + "step": 33, + "epoch": 0.052884615384615384, + "cpu_mem": 3.352707072, + "gpu_mem": 1.125666816, + "loss": 1.4101, + "grad_norm": 1.3765735626220703, + "learning_rate": 0.00015714285714285713 + }, + { + "step": 34, + "epoch": 0.05448717948717949, + "cpu_mem": 3.353296896, + "gpu_mem": 1.125685248, + "loss": 1.4132, + "grad_norm": 1.7911897897720337, + "learning_rate": 0.00016190476190476187 + }, + { + "step": 35, + "epoch": 0.05608974358974359, + "cpu_mem": 3.35388672, + "gpu_mem": 1.125706752, + "loss": 1.494, + "grad_norm": 2.735259771347046, + "learning_rate": 0.00016666666666666666 + }, + { + "step": 36, + "epoch": 0.057692307692307696, + "cpu_mem": 3.354476544, + "gpu_mem": 1.125705216, + "loss": 1.3947, + "grad_norm": 0.7541161775588989, + "learning_rate": 0.0001714285714285714 + }, + { + "step": 37, + "epoch": 0.05929487179487179, + "cpu_mem": 3.355066368, + "gpu_mem": 1.125737472, + "loss": 1.4203, + "grad_norm": 1.1274186372756958, + "learning_rate": 0.0001761904761904762 + }, + { + "step": 38, + "epoch": 0.060897435897435896, + "cpu_mem": 3.355656192, + "gpu_mem": 1.125689856, + "loss": 1.422, + "grad_norm": 0.8396160006523132, + "learning_rate": 0.00018095238095238093 + }, + { + "step": 39, + "epoch": 0.0625, + "cpu_mem": 3.356246016, + "gpu_mem": 1.125746688, + "loss": 1.3601, + "grad_norm": 1.1649466753005981, + "learning_rate": 0.00018571428571428572 + }, + { + "step": 40, + "epoch": 0.0641025641025641, + "cpu_mem": 3.35683584, + "gpu_mem": 1.125674496, + "loss": 1.4526, + "grad_norm": 1.3186384439468384, + "learning_rate": 0.00019047619047619045 + }, + { + "step": 41, + "epoch": 0.06570512820512821, + "cpu_mem": 3.357229056, + "gpu_mem": 1.125702144, + "loss": 1.3795, + "grad_norm": 0.6759102940559387, + "learning_rate": 0.00019523809523809522 + }, + { + "step": 42, + "epoch": 0.0673076923076923, + "cpu_mem": 3.35781888, + "gpu_mem": 1.125715968, + "loss": 1.4374, + "grad_norm": 1.4251306056976318, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 43, + "epoch": 0.06891025641025642, + "cpu_mem": 3.358408704, + "gpu_mem": 1.125722112, + "loss": 1.3874, + "grad_norm": 0.6681056618690491, + "learning_rate": 0.00020476190476190475 + }, + { + "step": 44, + "epoch": 0.07051282051282051, + "cpu_mem": 3.358998528, + "gpu_mem": 1.125700608, + "loss": 1.4178, + "grad_norm": 0.9374871253967285, + "learning_rate": 0.00020952380952380948 + }, + { + "step": 45, + "epoch": 0.07211538461538461, + "cpu_mem": 3.359391744, + "gpu_mem": 1.125700608, + "loss": 1.3806, + "grad_norm": 0.4291742742061615, + "learning_rate": 0.00021428571428571427 + }, + { + "step": 46, + "epoch": 0.07371794871794872, + "cpu_mem": 3.359981568, + "gpu_mem": 1.125700608, + "loss": 1.3783, + "grad_norm": 1.275956630706787, + "learning_rate": 0.000219047619047619 + }, + { + "step": 47, + "epoch": 0.07532051282051282, + "cpu_mem": 3.360374784, + "gpu_mem": 1.125686784, + "loss": 1.4231, + "grad_norm": 1.2517460584640503, + "learning_rate": 0.0002238095238095238 + }, + { + "step": 48, + "epoch": 0.07692307692307693, + "cpu_mem": 3.360964608, + "gpu_mem": 1.125705216, + "loss": 1.3922, + "grad_norm": 1.2480961084365845, + "learning_rate": 0.00022857142857142854 + }, + { + "step": 49, + "epoch": 0.07852564102564102, + "cpu_mem": 3.361554432, + "gpu_mem": 1.125717504, + "loss": 1.4394, + "grad_norm": 1.4807002544403076, + "learning_rate": 0.0002333333333333333 + }, + { + "step": 50, + "epoch": 0.08012820512820513, + "cpu_mem": 3.361947648, + "gpu_mem": 1.125694464, + "loss": 1.394, + "grad_norm": 0.861428439617157, + "learning_rate": 0.00023809523809523807 + }, + { + "step": 51, + "epoch": 0.08173076923076923, + "cpu_mem": 3.362537472, + "gpu_mem": 1.125679104, + "loss": 1.3769, + "grad_norm": 0.8003106713294983, + "learning_rate": 0.00024285714285714283 + }, + { + "step": 52, + "epoch": 0.08333333333333333, + "cpu_mem": 3.363127296, + "gpu_mem": 1.125683712, + "loss": 1.3718, + "grad_norm": 0.5499626994132996, + "learning_rate": 0.00024761904761904757 + }, + { + "step": 53, + "epoch": 0.08493589743589744, + "cpu_mem": 3.363520512, + "gpu_mem": 1.12571136, + "loss": 1.3785, + "grad_norm": 1.0003025531768799, + "learning_rate": 0.0002523809523809524 + }, + { + "step": 54, + "epoch": 0.08653846153846154, + "cpu_mem": 3.363913728, + "gpu_mem": 1.125686784, + "loss": 1.4859, + "grad_norm": 1.8696109056472778, + "learning_rate": 0.0002571428571428571 + }, + { + "step": 55, + "epoch": 0.08814102564102565, + "cpu_mem": 3.364306944, + "gpu_mem": 1.125705216, + "loss": 1.4152, + "grad_norm": 1.0043869018554688, + "learning_rate": 0.00026190476190476186 + }, + { + "step": 56, + "epoch": 0.08974358974358974, + "cpu_mem": 3.364896768, + "gpu_mem": 1.125699072, + "loss": 1.3727, + "grad_norm": 0.4658830165863037, + "learning_rate": 0.0002666666666666666 + }, + { + "step": 57, + "epoch": 0.09134615384615384, + "cpu_mem": 3.365289984, + "gpu_mem": 1.12566528, + "loss": 1.4092, + "grad_norm": 0.8491731882095337, + "learning_rate": 0.0002714285714285714 + }, + { + "step": 58, + "epoch": 0.09294871794871795, + "cpu_mem": 3.3656832, + "gpu_mem": 1.125694464, + "loss": 1.44, + "grad_norm": 1.3348828554153442, + "learning_rate": 0.00027619047619047615 + }, + { + "step": 59, + "epoch": 0.09455128205128205, + "cpu_mem": 3.366273024, + "gpu_mem": 1.125677568, + "loss": 1.3162, + "grad_norm": 0.5980450510978699, + "learning_rate": 0.0002809523809523809 + }, + { + "step": 60, + "epoch": 0.09615384615384616, + "cpu_mem": 3.36666624, + "gpu_mem": 1.12571904, + "loss": 1.4356, + "grad_norm": 1.4766641855239868, + "learning_rate": 0.0002857142857142857 + }, + { + "step": 61, + "epoch": 0.09775641025641026, + "cpu_mem": 3.367059456, + "gpu_mem": 1.125685248, + "loss": 1.4005, + "grad_norm": 0.6815160512924194, + "learning_rate": 0.00029047619047619045 + }, + { + "step": 62, + "epoch": 0.09935897435897435, + "cpu_mem": 3.36764928, + "gpu_mem": 1.125725184, + "loss": 1.3562, + "grad_norm": 1.0125457048416138, + "learning_rate": 0.0002952380952380952 + }, + { + "step": 63, + "epoch": 0.10096153846153846, + "cpu_mem": 3.368239104, + "gpu_mem": 1.125679104, + "loss": 1.4262, + "grad_norm": 1.1522331237792969, + "learning_rate": 0.0003 + }, + { + "step": 64, + "epoch": 0.10256410256410256, + "cpu_mem": 3.36863232, + "gpu_mem": 1.125683712, + "loss": 1.4501, + "grad_norm": 1.2873867750167847, + "learning_rate": 0.00029999764801714643 + }, + { + "step": 65, + "epoch": 0.10416666666666667, + "cpu_mem": 3.369025536, + "gpu_mem": 1.12568064, + "loss": 1.4211, + "grad_norm": 1.169226050376892, + "learning_rate": 0.00029999059214234344 + }, + { + "step": 66, + "epoch": 0.10576923076923077, + "cpu_mem": 3.369418752, + "gpu_mem": 1.125699072, + "loss": 1.3945, + "grad_norm": 0.5259639620780945, + "learning_rate": 0.00029997883259686163 + }, + { + "step": 67, + "epoch": 0.10737179487179487, + "cpu_mem": 3.369811968, + "gpu_mem": 1.125691392, + "loss": 1.4093, + "grad_norm": 0.8873715400695801, + "learning_rate": 0.00029996236974947764 + }, + { + "step": 68, + "epoch": 0.10897435897435898, + "cpu_mem": 3.370401792, + "gpu_mem": 1.125676032, + "loss": 1.4283, + "grad_norm": 1.4243661165237427, + "learning_rate": 0.00029994120411646263 + }, + { + "step": 69, + "epoch": 0.11057692307692307, + "cpu_mem": 3.370795008, + "gpu_mem": 1.125746688, + "loss": 1.3785, + "grad_norm": 0.8999771475791931, + "learning_rate": 0.00029991533636156603 + }, + { + "step": 70, + "epoch": 0.11217948717948718, + "cpu_mem": 3.371188224, + "gpu_mem": 1.125697536, + "loss": 1.4024, + "grad_norm": 1.1074942350387573, + "learning_rate": 0.00029988476729599464 + }, + { + "step": 71, + "epoch": 0.11378205128205128, + "cpu_mem": 3.37158144, + "gpu_mem": 1.125722112, + "loss": 1.3599, + "grad_norm": 1.0623029470443726, + "learning_rate": 0.0002998494978783874 + }, + { + "step": 72, + "epoch": 0.11538461538461539, + "cpu_mem": 3.372171264, + "gpu_mem": 1.125692928, + "loss": 1.4184, + "grad_norm": 1.3982954025268555, + "learning_rate": 0.0002998095292147852 + }, + { + "step": 73, + "epoch": 0.11698717948717949, + "cpu_mem": 3.37256448, + "gpu_mem": 1.125685248, + "loss": 1.4811, + "grad_norm": 2.3660974502563477, + "learning_rate": 0.0002997648625585962 + }, + { + "step": 74, + "epoch": 0.11858974358974358, + "cpu_mem": 3.372957696, + "gpu_mem": 1.125679104, + "loss": 1.371, + "grad_norm": 0.5041117072105408, + "learning_rate": 0.0002997154993105566 + }, + { + "step": 75, + "epoch": 0.1201923076923077, + "cpu_mem": 3.373350912, + "gpu_mem": 1.125708288, + "loss": 1.4317, + "grad_norm": 1.4604682922363281, + "learning_rate": 0.00029966144101868636 + }, + { + "step": 76, + "epoch": 0.12179487179487179, + "cpu_mem": 3.373744128, + "gpu_mem": 1.125699072, + "loss": 1.397, + "grad_norm": 1.543352723121643, + "learning_rate": 0.0002996026893782414 + }, + { + "step": 77, + "epoch": 0.1233974358974359, + "cpu_mem": 3.373940736, + "gpu_mem": 1.125686784, + "loss": 1.3887, + "grad_norm": 0.47296813130378723, + "learning_rate": 0.00029953924623165955 + }, + { + "step": 78, + "epoch": 0.125, + "cpu_mem": 3.37453056, + "gpu_mem": 1.125679104, + "loss": 1.4264, + "grad_norm": 1.2204669713974, + "learning_rate": 0.0002994711135685035 + }, + { + "step": 79, + "epoch": 0.1266025641025641, + "cpu_mem": 3.374923776, + "gpu_mem": 1.125731328, + "loss": 1.39, + "grad_norm": 0.6343543529510498, + "learning_rate": 0.00029939829352539787 + }, + { + "step": 80, + "epoch": 0.1282051282051282, + "cpu_mem": 3.3755136, + "gpu_mem": 1.125709824, + "loss": 1.4199, + "grad_norm": 0.8529762029647827, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 81, + "epoch": 0.12980769230769232, + "cpu_mem": 3.375906816, + "gpu_mem": 1.12570368, + "loss": 1.3567, + "grad_norm": 0.4079361855983734, + "learning_rate": 0.0002992386005807413 + }, + { + "step": 82, + "epoch": 0.13141025641025642, + "cpu_mem": 3.376300032, + "gpu_mem": 1.12568064, + "loss": 1.3885, + "grad_norm": 0.7243728637695312, + "learning_rate": 0.00029915173268712456 + }, + { + "step": 83, + "epoch": 0.1330128205128205, + "cpu_mem": 3.376693248, + "gpu_mem": 1.125702144, + "loss": 1.4219, + "grad_norm": 1.08815336227417, + "learning_rate": 0.0002990601874292698 + }, + { + "step": 84, + "epoch": 0.1346153846153846, + "cpu_mem": 3.377086464, + "gpu_mem": 1.125674496, + "loss": 1.4069, + "grad_norm": 0.9862790107727051, + "learning_rate": 0.0002989639676780152 + }, + { + "step": 85, + "epoch": 0.1362179487179487, + "cpu_mem": 3.377283072, + "gpu_mem": 1.125682176, + "loss": 1.3776, + "grad_norm": 0.6880221366882324, + "learning_rate": 0.0002988630764507904 + }, + { + "step": 86, + "epoch": 0.13782051282051283, + "cpu_mem": 3.377676288, + "gpu_mem": 1.125700608, + "loss": 1.3825, + "grad_norm": 0.9968577027320862, + "learning_rate": 0.00029875751691152094 + }, + { + "step": 87, + "epoch": 0.13942307692307693, + "cpu_mem": 3.378266112, + "gpu_mem": 1.125689856, + "loss": 1.4493, + "grad_norm": 1.813805341720581, + "learning_rate": 0.0002986472923705301 + }, + { + "step": 88, + "epoch": 0.14102564102564102, + "cpu_mem": 3.378659328, + "gpu_mem": 1.12568832, + "loss": 1.3926, + "grad_norm": 1.2555878162384033, + "learning_rate": 0.0002985324062844341 + }, + { + "step": 89, + "epoch": 0.14262820512820512, + "cpu_mem": 3.379052544, + "gpu_mem": 1.125683712, + "loss": 1.397, + "grad_norm": 0.8255822062492371, + "learning_rate": 0.0002984128622560345 + }, + { + "step": 90, + "epoch": 0.14423076923076922, + "cpu_mem": 3.379249152, + "gpu_mem": 1.12568832, + "loss": 1.3857, + "grad_norm": 0.715108335018158, + "learning_rate": 0.0002982886640342046 + }, + { + "step": 91, + "epoch": 0.14583333333333334, + "cpu_mem": 3.379838976, + "gpu_mem": 1.125699072, + "loss": 1.4307, + "grad_norm": 1.2281923294067383, + "learning_rate": 0.00029815981551377217 + }, + { + "step": 92, + "epoch": 0.14743589743589744, + "cpu_mem": 3.380232192, + "gpu_mem": 1.125702144, + "loss": 1.4069, + "grad_norm": 0.7532985806465149, + "learning_rate": 0.00029802632073539745 + }, + { + "step": 93, + "epoch": 0.14903846153846154, + "cpu_mem": 3.3804288, + "gpu_mem": 1.125702144, + "loss": 1.4154, + "grad_norm": 0.4658808708190918, + "learning_rate": 0.0002978881838854462 + }, + { + "step": 94, + "epoch": 0.15064102564102563, + "cpu_mem": 3.380822016, + "gpu_mem": 1.125697536, + "loss": 1.375, + "grad_norm": 0.4443753659725189, + "learning_rate": 0.00029774540929585847 + }, + { + "step": 95, + "epoch": 0.15224358974358973, + "cpu_mem": 3.381215232, + "gpu_mem": 1.125715968, + "loss": 1.4148, + "grad_norm": 1.2880910634994507, + "learning_rate": 0.0002975980014440126 + }, + { + "step": 96, + "epoch": 0.15384615384615385, + "cpu_mem": 3.381608448, + "gpu_mem": 1.12571904, + "loss": 1.3909, + "grad_norm": 0.5830844640731812, + "learning_rate": 0.00029744596495258525 + }, + { + "step": 97, + "epoch": 0.15544871794871795, + "cpu_mem": 3.382001664, + "gpu_mem": 1.125696, + "loss": 1.3816, + "grad_norm": 0.3255292475223541, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 98, + "epoch": 0.15705128205128205, + "cpu_mem": 3.38239488, + "gpu_mem": 1.125706752, + "loss": 1.3901, + "grad_norm": 0.489499032497406, + "learning_rate": 0.000297128025267308 + }, + { + "step": 99, + "epoch": 0.15865384615384615, + "cpu_mem": 3.382591488, + "gpu_mem": 1.125706752, + "loss": 1.3927, + "grad_norm": 0.9686228632926941, + "learning_rate": 0.00029696213204397396 + }, + { + "step": 100, + "epoch": 0.16025641025641027, + "cpu_mem": 3.382984704, + "gpu_mem": 1.125682176, + "loss": 1.3789, + "grad_norm": 0.5790128707885742, + "learning_rate": 0.00029679163012177737 + }, + { + "step": 101, + "epoch": 0.16185897435897437, + "cpu_mem": 3.38337792, + "gpu_mem": 1.12571136, + "loss": 1.4503, + "grad_norm": 1.4607429504394531, + "learning_rate": 0.0002966165248476196 + }, + { + "step": 102, + "epoch": 0.16346153846153846, + "cpu_mem": 3.383771136, + "gpu_mem": 1.12568832, + "loss": 1.3592, + "grad_norm": 0.782285749912262, + "learning_rate": 0.00029643682171276203 + }, + { + "step": 103, + "epoch": 0.16506410256410256, + "cpu_mem": 3.384164352, + "gpu_mem": 1.125705216, + "loss": 1.4308, + "grad_norm": 1.083450436592102, + "learning_rate": 0.0002962525263526538 + }, + { + "step": 104, + "epoch": 0.16666666666666666, + "cpu_mem": 3.384557568, + "gpu_mem": 1.12567296, + "loss": 1.4134, + "grad_norm": 1.1731876134872437, + "learning_rate": 0.0002960636445467553 + }, + { + "step": 105, + "epoch": 0.16826923076923078, + "cpu_mem": 3.384754176, + "gpu_mem": 1.12568832, + "loss": 1.3793, + "grad_norm": 1.4168041944503784, + "learning_rate": 0.0002958701822183569 + }, + { + "step": 106, + "epoch": 0.16987179487179488, + "cpu_mem": 3.385147392, + "gpu_mem": 1.125668352, + "loss": 1.4795, + "grad_norm": 1.7007389068603516, + "learning_rate": 0.0002956721454343928 + }, + { + "step": 107, + "epoch": 0.17147435897435898, + "cpu_mem": 3.385344, + "gpu_mem": 1.125709824, + "loss": 1.4031, + "grad_norm": 0.9290204048156738, + "learning_rate": 0.0002954695404052514 + }, + { + "step": 108, + "epoch": 0.17307692307692307, + "cpu_mem": 3.385737216, + "gpu_mem": 1.125705216, + "loss": 1.4011, + "grad_norm": 0.6739901304244995, + "learning_rate": 0.00029526237348458003 + }, + { + "step": 109, + "epoch": 0.17467948717948717, + "cpu_mem": 3.386130432, + "gpu_mem": 1.12571136, + "loss": 1.3675, + "grad_norm": 0.681174099445343, + "learning_rate": 0.000295050651169086 + }, + { + "step": 110, + "epoch": 0.1762820512820513, + "cpu_mem": 3.386523648, + "gpu_mem": 1.125708288, + "loss": 1.3946, + "grad_norm": 0.6556748151779175, + "learning_rate": 0.00029483438009833264 + }, + { + "step": 111, + "epoch": 0.1778846153846154, + "cpu_mem": 3.386720256, + "gpu_mem": 1.125709824, + "loss": 1.3805, + "grad_norm": 0.8595457673072815, + "learning_rate": 0.0002946135670545314 + }, + { + "step": 112, + "epoch": 0.1794871794871795, + "cpu_mem": 3.387113472, + "gpu_mem": 1.125706752, + "loss": 1.3737, + "grad_norm": 0.7164903283119202, + "learning_rate": 0.0002943882189623288 + }, + { + "step": 113, + "epoch": 0.18108974358974358, + "cpu_mem": 3.38731008, + "gpu_mem": 1.125686784, + "loss": 1.3746, + "grad_norm": 0.5462442636489868, + "learning_rate": 0.00029415834288858947 + }, + { + "step": 114, + "epoch": 0.18269230769230768, + "cpu_mem": 3.387703296, + "gpu_mem": 1.125682176, + "loss": 1.3788, + "grad_norm": 1.333668828010559, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 115, + "epoch": 0.1842948717948718, + "cpu_mem": 3.388096512, + "gpu_mem": 1.125700608, + "loss": 1.3838, + "grad_norm": 0.5709267258644104, + "learning_rate": 0.0002936850357737156 + }, + { + "step": 116, + "epoch": 0.1858974358974359, + "cpu_mem": 3.38829312, + "gpu_mem": 1.12571136, + "loss": 1.3831, + "grad_norm": 0.5874531269073486, + "learning_rate": 0.0002934416195753839 + }, + { + "step": 117, + "epoch": 0.1875, + "cpu_mem": 3.388686336, + "gpu_mem": 1.125697536, + "loss": 1.3663, + "grad_norm": 0.6250464916229248, + "learning_rate": 0.00029319370508065594 + }, + { + "step": 118, + "epoch": 0.1891025641025641, + "cpu_mem": 3.389079552, + "gpu_mem": 1.125712896, + "loss": 1.3372, + "grad_norm": 1.2857674360275269, + "learning_rate": 0.0002929413000640735 + }, + { + "step": 119, + "epoch": 0.1907051282051282, + "cpu_mem": 3.38927616, + "gpu_mem": 1.125694464, + "loss": 1.4184, + "grad_norm": 2.605499744415283, + "learning_rate": 0.0002926844124410001 + }, + { + "step": 120, + "epoch": 0.19230769230769232, + "cpu_mem": 3.389669376, + "gpu_mem": 1.125720576, + "loss": 1.3922, + "grad_norm": 1.2134472131729126, + "learning_rate": 0.0002924230502673731 + }, + { + "step": 121, + "epoch": 0.19391025641025642, + "cpu_mem": 3.390062592, + "gpu_mem": 1.125679104, + "loss": 1.2999, + "grad_norm": 1.240521788597107, + "learning_rate": 0.00029215722173945034 + }, + { + "step": 122, + "epoch": 0.1955128205128205, + "cpu_mem": 3.390455808, + "gpu_mem": 1.12571136, + "loss": 1.3469, + "grad_norm": 1.2988940477371216, + "learning_rate": 0.0002918869351935537 + }, + { + "step": 123, + "epoch": 0.1971153846153846, + "cpu_mem": 3.390849024, + "gpu_mem": 1.125705216, + "loss": 1.3746, + "grad_norm": 1.83686101436615, + "learning_rate": 0.00029161219910580754 + }, + { + "step": 124, + "epoch": 0.1987179487179487, + "cpu_mem": 3.391045632, + "gpu_mem": 1.125706752, + "loss": 1.3387, + "grad_norm": 2.2672278881073, + "learning_rate": 0.00029133302209187267 + }, + { + "step": 125, + "epoch": 0.20032051282051283, + "cpu_mem": 3.391438848, + "gpu_mem": 1.125682176, + "loss": 1.2579, + "grad_norm": 1.7325142621994019, + "learning_rate": 0.00029104941290667655 + }, + { + "step": 126, + "epoch": 0.20192307692307693, + "cpu_mem": 3.391635456, + "gpu_mem": 1.125691392, + "loss": 1.2602, + "grad_norm": 1.9697414636611938, + "learning_rate": 0.00029076138044413827 + }, + { + "step": 127, + "epoch": 0.20352564102564102, + "cpu_mem": 3.391832064, + "gpu_mem": 1.125677568, + "loss": 1.2777, + "grad_norm": 4.121600151062012, + "learning_rate": 0.00029046893373689 + }, + { + "step": 128, + "epoch": 0.20512820512820512, + "cpu_mem": 3.39222528, + "gpu_mem": 1.125714432, + "loss": 1.175, + "grad_norm": 3.4239773750305176, + "learning_rate": 0.00029017208195599375 + }, + { + "step": 129, + "epoch": 0.20673076923076922, + "cpu_mem": 3.392618496, + "gpu_mem": 1.12571136, + "loss": 1.3623, + "grad_norm": 3.2858145236968994, + "learning_rate": 0.0002898708344106533 + }, + { + "step": 130, + "epoch": 0.20833333333333334, + "cpu_mem": 3.392815104, + "gpu_mem": 1.12571136, + "loss": 1.1651, + "grad_norm": 2.0062475204467773, + "learning_rate": 0.00028956520054792303 + }, + { + "step": 131, + "epoch": 0.20993589743589744, + "cpu_mem": 3.39320832, + "gpu_mem": 1.125700608, + "loss": 1.187, + "grad_norm": 2.2492034435272217, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 132, + "epoch": 0.21153846153846154, + "cpu_mem": 3.393601536, + "gpu_mem": 1.125700608, + "loss": 1.1877, + "grad_norm": 2.4924521446228027, + "learning_rate": 0.0002889408123459782 + }, + { + "step": 133, + "epoch": 0.21314102564102563, + "cpu_mem": 3.393798144, + "gpu_mem": 1.125682176, + "loss": 1.2567, + "grad_norm": 2.6062369346618652, + "learning_rate": 0.000288622077587435 + }, + { + "step": 134, + "epoch": 0.21474358974358973, + "cpu_mem": 3.39419136, + "gpu_mem": 1.125692928, + "loss": 1.2411, + "grad_norm": 4.003267288208008, + "learning_rate": 0.0002882989956722303 + }, + { + "step": 135, + "epoch": 0.21634615384615385, + "cpu_mem": 3.394387968, + "gpu_mem": 1.125702144, + "loss": 1.3189, + "grad_norm": 4.310692310333252, + "learning_rate": 0.00028797157673213914 + }, + { + "step": 136, + "epoch": 0.21794871794871795, + "cpu_mem": 3.394781184, + "gpu_mem": 1.125717504, + "loss": 1.1925, + "grad_norm": 2.558511257171631, + "learning_rate": 0.00028763983103494465 + }, + { + "step": 137, + "epoch": 0.21955128205128205, + "cpu_mem": 3.394977792, + "gpu_mem": 1.12566528, + "loss": 1.2388, + "grad_norm": 3.2781012058258057, + "learning_rate": 0.00028730376898411606 + }, + { + "step": 138, + "epoch": 0.22115384615384615, + "cpu_mem": 3.395371008, + "gpu_mem": 1.125685248, + "loss": 1.3252, + "grad_norm": 3.8809938430786133, + "learning_rate": 0.00028696340111848245 + }, + { + "step": 139, + "epoch": 0.22275641025641027, + "cpu_mem": 3.395567616, + "gpu_mem": 1.125666816, + "loss": 1.2711, + "grad_norm": 2.7481954097747803, + "learning_rate": 0.00028661873811190226 + }, + { + "step": 140, + "epoch": 0.22435897435897437, + "cpu_mem": 3.395960832, + "gpu_mem": 1.125683712, + "loss": 1.1976, + "grad_norm": 1.9528728723526, + "learning_rate": 0.0002862697907729285 + }, + { + "step": 141, + "epoch": 0.22596153846153846, + "cpu_mem": 3.39615744, + "gpu_mem": 1.125689856, + "loss": 1.2228, + "grad_norm": 2.3535447120666504, + "learning_rate": 0.0002859165700444701 + }, + { + "step": 142, + "epoch": 0.22756410256410256, + "cpu_mem": 3.396354048, + "gpu_mem": 1.125686784, + "loss": 1.0805, + "grad_norm": 2.219804048538208, + "learning_rate": 0.00028555908700344824 + }, + { + "step": 143, + "epoch": 0.22916666666666666, + "cpu_mem": 3.396550656, + "gpu_mem": 1.125712896, + "loss": 1.1565, + "grad_norm": 3.2043728828430176, + "learning_rate": 0.00028519735286044936 + }, + { + "step": 144, + "epoch": 0.23076923076923078, + "cpu_mem": 3.396943872, + "gpu_mem": 1.125686784, + "loss": 1.2104, + "grad_norm": 3.6737678050994873, + "learning_rate": 0.0002848313789593736 + }, + { + "step": 145, + "epoch": 0.23237179487179488, + "cpu_mem": 3.39714048, + "gpu_mem": 1.12572672, + "loss": 1.0743, + "grad_norm": 2.609926223754883, + "learning_rate": 0.00028446117677707867 + }, + { + "step": 146, + "epoch": 0.23397435897435898, + "cpu_mem": 3.397533696, + "gpu_mem": 1.125676032, + "loss": 1.1487, + "grad_norm": 2.5469577312469482, + "learning_rate": 0.0002840867579230205 + }, + { + "step": 147, + "epoch": 0.23557692307692307, + "cpu_mem": 3.397730304, + "gpu_mem": 1.125685248, + "loss": 1.1049, + "grad_norm": 3.6170644760131836, + "learning_rate": 0.00028370813413888866 + }, + { + "step": 148, + "epoch": 0.23717948717948717, + "cpu_mem": 3.397926912, + "gpu_mem": 1.125705216, + "loss": 1.0876, + "grad_norm": 3.829030990600586, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 149, + "epoch": 0.2387820512820513, + "cpu_mem": 3.398320128, + "gpu_mem": 1.125696, + "loss": 0.93, + "grad_norm": 3.5031020641326904, + "learning_rate": 0.0002829383194061186 + }, + { + "step": 150, + "epoch": 0.2403846153846154, + "cpu_mem": 3.398516736, + "gpu_mem": 1.125708288, + "loss": 1.1399, + "grad_norm": 5.434287071228027, + "learning_rate": 0.00028254715259869444 + }, + { + "step": 151, + "epoch": 0.2419871794871795, + "cpu_mem": 3.398909952, + "gpu_mem": 1.12567296, + "loss": 0.998, + "grad_norm": 5.029645919799805, + "learning_rate": 0.00028215182914286766 + }, + { + "step": 152, + "epoch": 0.24358974358974358, + "cpu_mem": 3.39910656, + "gpu_mem": 1.12570368, + "loss": 0.995, + "grad_norm": 3.816197156906128, + "learning_rate": 0.00028175236143589144 + }, + { + "step": 153, + "epoch": 0.24519230769230768, + "cpu_mem": 3.399499776, + "gpu_mem": 1.125699072, + "loss": 1.1092, + "grad_norm": 5.081612586975098, + "learning_rate": 0.0002813487620049817 + }, + { + "step": 154, + "epoch": 0.2467948717948718, + "cpu_mem": 3.399696384, + "gpu_mem": 1.125723648, + "loss": 1.0461, + "grad_norm": 4.313901424407959, + "learning_rate": 0.00028094104350692435 + }, + { + "step": 155, + "epoch": 0.2483974358974359, + "cpu_mem": 3.399892992, + "gpu_mem": 1.125660672, + "loss": 0.9318, + "grad_norm": 4.486854553222656, + "learning_rate": 0.0002805292187276783 + }, + { + "step": 156, + "epoch": 0.25, + "cpu_mem": 3.4000896, + "gpu_mem": 1.125714432, + "loss": 1.0788, + "grad_norm": 4.202513694763184, + "learning_rate": 0.0002801133005819744 + }, + { + "step": 157, + "epoch": 0.2516025641025641, + "cpu_mem": 3.400286208, + "gpu_mem": 1.125706752, + "loss": 0.9256, + "grad_norm": 4.320004940032959, + "learning_rate": 0.00027969330211291077 + }, + { + "step": 158, + "epoch": 0.2532051282051282, + "cpu_mem": 3.400679424, + "gpu_mem": 1.125722112, + "loss": 0.8337, + "grad_norm": 3.9458351135253906, + "learning_rate": 0.00027926923649154327 + }, + { + "step": 159, + "epoch": 0.2548076923076923, + "cpu_mem": 3.400876032, + "gpu_mem": 1.125723648, + "loss": 0.7508, + "grad_norm": 3.5022082328796387, + "learning_rate": 0.00027884111701647284 + }, + { + "step": 160, + "epoch": 0.2564102564102564, + "cpu_mem": 3.401269248, + "gpu_mem": 1.125691392, + "loss": 0.8781, + "grad_norm": 4.364773273468018, + "learning_rate": 0.00027840895711342834 + }, + { + "step": 161, + "epoch": 0.25801282051282054, + "cpu_mem": 3.401465856, + "gpu_mem": 1.125683712, + "loss": 0.8313, + "grad_norm": 5.633781909942627, + "learning_rate": 0.00027797277033484553 + }, + { + "step": 162, + "epoch": 0.25961538461538464, + "cpu_mem": 3.401662464, + "gpu_mem": 1.12571904, + "loss": 0.7676, + "grad_norm": 6.818156719207764, + "learning_rate": 0.0002775325703594421 + }, + { + "step": 163, + "epoch": 0.26121794871794873, + "cpu_mem": 3.401859072, + "gpu_mem": 1.125666816, + "loss": 0.7989, + "grad_norm": 5.311073303222656, + "learning_rate": 0.0002770883709917886 + }, + { + "step": 164, + "epoch": 0.26282051282051283, + "cpu_mem": 3.40205568, + "gpu_mem": 1.125702144, + "loss": 0.6761, + "grad_norm": 3.341973304748535, + "learning_rate": 0.0002766401861618757 + }, + { + "step": 165, + "epoch": 0.2644230769230769, + "cpu_mem": 3.402448896, + "gpu_mem": 1.125691392, + "loss": 0.9284, + "grad_norm": 4.286813735961914, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 166, + "epoch": 0.266025641025641, + "cpu_mem": 3.402645504, + "gpu_mem": 1.125723648, + "loss": 0.707, + "grad_norm": 4.5421624183654785, + "learning_rate": 0.0002757319164597092 + }, + { + "step": 167, + "epoch": 0.2676282051282051, + "cpu_mem": 3.402842112, + "gpu_mem": 1.125717504, + "loss": 0.8196, + "grad_norm": 4.173457145690918, + "learning_rate": 0.0002752718600705858 + }, + { + "step": 168, + "epoch": 0.2692307692307692, + "cpu_mem": 3.40303872, + "gpu_mem": 1.125696, + "loss": 0.8867, + "grad_norm": 6.964019775390625, + "learning_rate": 0.00027480787518457023 + }, + { + "step": 169, + "epoch": 0.2708333333333333, + "cpu_mem": 3.403431936, + "gpu_mem": 1.125692928, + "loss": 1.1885, + "grad_norm": 6.968769550323486, + "learning_rate": 0.0002743399763521223 + }, + { + "step": 170, + "epoch": 0.2724358974358974, + "cpu_mem": 3.403628544, + "gpu_mem": 1.125729792, + "loss": 0.7361, + "grad_norm": 3.8965911865234375, + "learning_rate": 0.0002738681782464426 + }, + { + "step": 171, + "epoch": 0.27403846153846156, + "cpu_mem": 3.40402176, + "gpu_mem": 1.12570368, + "loss": 0.7407, + "grad_norm": 3.683558464050293, + "learning_rate": 0.0002733924956630117 + }, + { + "step": 172, + "epoch": 0.27564102564102566, + "cpu_mem": 3.40402176, + "gpu_mem": 1.12568064, + "loss": 0.7598, + "grad_norm": 3.583803176879883, + "learning_rate": 0.00027291294351912664 + }, + { + "step": 173, + "epoch": 0.27724358974358976, + "cpu_mem": 3.404414976, + "gpu_mem": 1.125706752, + "loss": 0.8004, + "grad_norm": 3.9168784618377686, + "learning_rate": 0.00027242953685343327 + }, + { + "step": 174, + "epoch": 0.27884615384615385, + "cpu_mem": 3.404611584, + "gpu_mem": 1.12571904, + "loss": 1.0027, + "grad_norm": 5.229266166687012, + "learning_rate": 0.0002719422908254538 + }, + { + "step": 175, + "epoch": 0.28044871794871795, + "cpu_mem": 3.404808192, + "gpu_mem": 1.12568064, + "loss": 0.7617, + "grad_norm": 3.3253166675567627, + "learning_rate": 0.0002714512207151125 + }, + { + "step": 176, + "epoch": 0.28205128205128205, + "cpu_mem": 3.4050048, + "gpu_mem": 1.125689856, + "loss": 0.7667, + "grad_norm": 3.9033665657043457, + "learning_rate": 0.0002709563419222557 + }, + { + "step": 177, + "epoch": 0.28365384615384615, + "cpu_mem": 3.405201408, + "gpu_mem": 1.125671424, + "loss": 0.6719, + "grad_norm": 3.880782127380371, + "learning_rate": 0.0002704576699661691 + }, + { + "step": 178, + "epoch": 0.28525641025641024, + "cpu_mem": 3.405398016, + "gpu_mem": 1.125685248, + "loss": 0.6488, + "grad_norm": 4.7876129150390625, + "learning_rate": 0.0002699552204850914 + }, + { + "step": 179, + "epoch": 0.28685897435897434, + "cpu_mem": 3.405791232, + "gpu_mem": 1.125692928, + "loss": 0.6847, + "grad_norm": 3.5362391471862793, + "learning_rate": 0.0002694490092357233 + }, + { + "step": 180, + "epoch": 0.28846153846153844, + "cpu_mem": 3.40598784, + "gpu_mem": 1.125674496, + "loss": 0.7462, + "grad_norm": 3.859915018081665, + "learning_rate": 0.00026893905209273404 + }, + { + "step": 181, + "epoch": 0.2900641025641026, + "cpu_mem": 3.406184448, + "gpu_mem": 1.125705216, + "loss": 0.7058, + "grad_norm": 5.14849328994751, + "learning_rate": 0.00026842536504826286 + }, + { + "step": 182, + "epoch": 0.2916666666666667, + "cpu_mem": 3.406381056, + "gpu_mem": 1.125676032, + "loss": 0.8935, + "grad_norm": 6.168877601623535, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 183, + "epoch": 0.2932692307692308, + "cpu_mem": 3.406577664, + "gpu_mem": 1.125700608, + "loss": 0.5916, + "grad_norm": 3.2316267490386963, + "learning_rate": 0.0002673868658077717 + }, + { + "step": 184, + "epoch": 0.2948717948717949, + "cpu_mem": 3.406774272, + "gpu_mem": 1.12568064, + "loss": 0.568, + "grad_norm": 3.398488759994507, + "learning_rate": 0.00026686208617885055 + }, + { + "step": 185, + "epoch": 0.296474358974359, + "cpu_mem": 3.407167488, + "gpu_mem": 1.125712896, + "loss": 0.8174, + "grad_norm": 4.001771450042725, + "learning_rate": 0.0002663336417816238 + }, + { + "step": 186, + "epoch": 0.2980769230769231, + "cpu_mem": 3.407364096, + "gpu_mem": 1.12570368, + "loss": 0.892, + "grad_norm": 4.264076232910156, + "learning_rate": 0.0002658015491879868 + }, + { + "step": 187, + "epoch": 0.29967948717948717, + "cpu_mem": 3.407364096, + "gpu_mem": 1.125699072, + "loss": 0.7324, + "grad_norm": 3.2598748207092285, + "learning_rate": 0.00026526582508424175 + }, + { + "step": 188, + "epoch": 0.30128205128205127, + "cpu_mem": 3.407560704, + "gpu_mem": 1.125656064, + "loss": 0.7503, + "grad_norm": 3.702191114425659, + "learning_rate": 0.0002647264862705741 + }, + { + "step": 189, + "epoch": 0.30288461538461536, + "cpu_mem": 3.407757312, + "gpu_mem": 1.125735936, + "loss": 0.7221, + "grad_norm": 3.1509463787078857, + "learning_rate": 0.00026418354966052573 + }, + { + "step": 190, + "epoch": 0.30448717948717946, + "cpu_mem": 3.40795392, + "gpu_mem": 1.125686784, + "loss": 0.7377, + "grad_norm": 4.795535564422607, + "learning_rate": 0.00026363703228046454 + }, + { + "step": 191, + "epoch": 0.3060897435897436, + "cpu_mem": 3.408150528, + "gpu_mem": 1.125686784, + "loss": 0.7363, + "grad_norm": 4.421326637268066, + "learning_rate": 0.0002630869512690507 + }, + { + "step": 192, + "epoch": 0.3076923076923077, + "cpu_mem": 3.408543744, + "gpu_mem": 1.125652992, + "loss": 0.8141, + "grad_norm": 5.385985374450684, + "learning_rate": 0.0002625333238766989 + }, + { + "step": 193, + "epoch": 0.3092948717948718, + "cpu_mem": 3.408740352, + "gpu_mem": 1.125692928, + "loss": 0.3846, + "grad_norm": 3.0123801231384277, + "learning_rate": 0.0002619761674650377 + }, + { + "step": 194, + "epoch": 0.3108974358974359, + "cpu_mem": 3.40893696, + "gpu_mem": 1.12568832, + "loss": 0.6162, + "grad_norm": 4.219264030456543, + "learning_rate": 0.0002614154995063647 + }, + { + "step": 195, + "epoch": 0.3125, + "cpu_mem": 3.409133568, + "gpu_mem": 1.125676032, + "loss": 0.7992, + "grad_norm": 3.6730611324310303, + "learning_rate": 0.00026085133758309883 + }, + { + "step": 196, + "epoch": 0.3141025641025641, + "cpu_mem": 3.409330176, + "gpu_mem": 1.125700608, + "loss": 0.8453, + "grad_norm": 4.824152946472168, + "learning_rate": 0.0002602836993872292 + }, + { + "step": 197, + "epoch": 0.3157051282051282, + "cpu_mem": 3.409526784, + "gpu_mem": 1.125715968, + "loss": 0.9338, + "grad_norm": 5.33161735534668, + "learning_rate": 0.0002597126027197598 + }, + { + "step": 198, + "epoch": 0.3173076923076923, + "cpu_mem": 3.409723392, + "gpu_mem": 1.12568832, + "loss": 0.5646, + "grad_norm": 3.524167060852051, + "learning_rate": 0.0002591380654901515 + }, + { + "step": 199, + "epoch": 0.3189102564102564, + "cpu_mem": 3.40992, + "gpu_mem": 1.125685248, + "loss": 0.7841, + "grad_norm": 7.139894485473633, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 200, + "epoch": 0.32051282051282054, + "cpu_mem": 3.410116608, + "gpu_mem": 1.125700608, + "loss": 0.8584, + "grad_norm": 3.4273619651794434, + "learning_rate": 0.0002579787415212732 + }, + { + "step": 201, + "epoch": 0.32211538461538464, + "cpu_mem": 3.410313216, + "gpu_mem": 1.125677568, + "loss": 0.6819, + "grad_norm": 3.9533066749572754, + "learning_rate": 0.00025739399113813784 + }, + { + "step": 202, + "epoch": 0.32371794871794873, + "cpu_mem": 3.410509824, + "gpu_mem": 1.125679104, + "loss": 0.7597, + "grad_norm": 3.9273083209991455, + "learning_rate": 0.00025680587290399277 + }, + { + "step": 203, + "epoch": 0.32532051282051283, + "cpu_mem": 3.410706432, + "gpu_mem": 1.125720576, + "loss": 0.6434, + "grad_norm": 3.357543706893921, + "learning_rate": 0.0002562144052620913 + }, + { + "step": 204, + "epoch": 0.3269230769230769, + "cpu_mem": 3.41090304, + "gpu_mem": 1.125691392, + "loss": 0.626, + "grad_norm": 4.427675247192383, + "learning_rate": 0.00025561960676072354 + }, + { + "step": 205, + "epoch": 0.328525641025641, + "cpu_mem": 3.411099648, + "gpu_mem": 1.125691392, + "loss": 0.8255, + "grad_norm": 4.867093086242676, + "learning_rate": 0.0002550214960526344 + }, + { + "step": 206, + "epoch": 0.3301282051282051, + "cpu_mem": 3.411296256, + "gpu_mem": 1.12568832, + "loss": 0.6202, + "grad_norm": 3.462733745574951, + "learning_rate": 0.000254420091894439 + }, + { + "step": 207, + "epoch": 0.3317307692307692, + "cpu_mem": 3.411492864, + "gpu_mem": 1.12568832, + "loss": 0.637, + "grad_norm": 5.318131923675537, + "learning_rate": 0.0002538154131460342 + }, + { + "step": 208, + "epoch": 0.3333333333333333, + "cpu_mem": 3.411689472, + "gpu_mem": 1.125679104, + "loss": 0.5851, + "grad_norm": 5.021348476409912, + "learning_rate": 0.00025320747877000745 + }, + { + "step": 209, + "epoch": 0.3349358974358974, + "cpu_mem": 3.41188608, + "gpu_mem": 1.125714432, + "loss": 0.4778, + "grad_norm": 3.1915152072906494, + "learning_rate": 0.00025259630783104164 + }, + { + "step": 210, + "epoch": 0.33653846153846156, + "cpu_mem": 3.412082688, + "gpu_mem": 1.125671424, + "loss": 0.6835, + "grad_norm": 3.606722831726074, + "learning_rate": 0.00025198191949531786 + }, + { + "step": 211, + "epoch": 0.33814102564102566, + "cpu_mem": 3.412279296, + "gpu_mem": 1.125699072, + "loss": 0.6199, + "grad_norm": 3.3759098052978516, + "learning_rate": 0.00025136433302991366 + }, + { + "step": 212, + "epoch": 0.33974358974358976, + "cpu_mem": 3.412475904, + "gpu_mem": 1.125708288, + "loss": 0.4974, + "grad_norm": 2.7075963020324707, + "learning_rate": 0.00025074356780219946 + }, + { + "step": 213, + "epoch": 0.34134615384615385, + "cpu_mem": 3.412672512, + "gpu_mem": 1.12568064, + "loss": 0.5936, + "grad_norm": 3.171992540359497, + "learning_rate": 0.000250119643279231 + }, + { + "step": 214, + "epoch": 0.34294871794871795, + "cpu_mem": 3.41286912, + "gpu_mem": 1.125689856, + "loss": 0.6878, + "grad_norm": 3.8997409343719482, + "learning_rate": 0.0002494925790271386 + }, + { + "step": 215, + "epoch": 0.34455128205128205, + "cpu_mem": 3.413065728, + "gpu_mem": 1.125691392, + "loss": 0.5751, + "grad_norm": 3.959726095199585, + "learning_rate": 0.00024886239471051376 + }, + { + "step": 216, + "epoch": 0.34615384615384615, + "cpu_mem": 3.413262336, + "gpu_mem": 1.125691392, + "loss": 0.634, + "grad_norm": 3.5127100944519043, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 217, + "epoch": 0.34775641025641024, + "cpu_mem": 3.413458944, + "gpu_mem": 1.125676032, + "loss": 0.5288, + "grad_norm": 3.742642402648926, + "learning_rate": 0.0002475927450306363 + }, + { + "step": 218, + "epoch": 0.34935897435897434, + "cpu_mem": 3.413655552, + "gpu_mem": 1.125697536, + "loss": 0.5269, + "grad_norm": 3.44482684135437, + "learning_rate": 0.0002469533194833073 + }, + { + "step": 219, + "epoch": 0.35096153846153844, + "cpu_mem": 3.41385216, + "gpu_mem": 1.125731328, + "loss": 0.6405, + "grad_norm": 4.122509479522705, + "learning_rate": 0.0002463108535020447 + }, + { + "step": 220, + "epoch": 0.3525641025641026, + "cpu_mem": 3.414048768, + "gpu_mem": 1.125685248, + "loss": 0.881, + "grad_norm": 7.107041835784912, + "learning_rate": 0.0002456653672344348 + }, + { + "step": 221, + "epoch": 0.3541666666666667, + "cpu_mem": 3.414245376, + "gpu_mem": 1.125691392, + "loss": 0.6393, + "grad_norm": 4.435815811157227, + "learning_rate": 0.0002450168809227794 + }, + { + "step": 222, + "epoch": 0.3557692307692308, + "cpu_mem": 3.414441984, + "gpu_mem": 1.125706752, + "loss": 0.6795, + "grad_norm": 4.923554420471191, + "learning_rate": 0.00024436541490346095 + }, + { + "step": 223, + "epoch": 0.3573717948717949, + "cpu_mem": 3.414638592, + "gpu_mem": 1.125725184, + "loss": 0.4743, + "grad_norm": 3.590759515762329, + "learning_rate": 0.00024371098960630495 + }, + { + "step": 224, + "epoch": 0.358974358974359, + "cpu_mem": 3.4148352, + "gpu_mem": 1.125694464, + "loss": 0.6961, + "grad_norm": 3.6215953826904297, + "learning_rate": 0.000243053625553939 + }, + { + "step": 225, + "epoch": 0.3605769230769231, + "cpu_mem": 3.415031808, + "gpu_mem": 1.12568064, + "loss": 0.4657, + "grad_norm": 2.699805736541748, + "learning_rate": 0.00024239334336114953 + }, + { + "step": 226, + "epoch": 0.36217948717948717, + "cpu_mem": 3.415228416, + "gpu_mem": 1.12567296, + "loss": 0.5199, + "grad_norm": 3.0588605403900146, + "learning_rate": 0.0002417301637342352 + }, + { + "step": 227, + "epoch": 0.36378205128205127, + "cpu_mem": 3.415228416, + "gpu_mem": 1.125737472, + "loss": 0.6109, + "grad_norm": 3.6363399028778076, + "learning_rate": 0.00024106410747035744 + }, + { + "step": 228, + "epoch": 0.36538461538461536, + "cpu_mem": 3.415425024, + "gpu_mem": 1.125676032, + "loss": 0.6794, + "grad_norm": 4.365848064422607, + "learning_rate": 0.00024039519545688846 + }, + { + "step": 229, + "epoch": 0.36698717948717946, + "cpu_mem": 3.415621632, + "gpu_mem": 1.125728256, + "loss": 0.5598, + "grad_norm": 3.4743759632110596, + "learning_rate": 0.000239723448670756 + }, + { + "step": 230, + "epoch": 0.3685897435897436, + "cpu_mem": 3.41581824, + "gpu_mem": 1.125709824, + "loss": 0.5584, + "grad_norm": 4.179272174835205, + "learning_rate": 0.0002390488881777858 + }, + { + "step": 231, + "epoch": 0.3701923076923077, + "cpu_mem": 3.416014848, + "gpu_mem": 1.125708288, + "loss": 0.5556, + "grad_norm": 3.0370101928710938, + "learning_rate": 0.0002383715351320406 + }, + { + "step": 232, + "epoch": 0.3717948717948718, + "cpu_mem": 3.416211456, + "gpu_mem": 1.125712896, + "loss": 0.7079, + "grad_norm": 3.9434990882873535, + "learning_rate": 0.00023769141077515713 + }, + { + "step": 233, + "epoch": 0.3733974358974359, + "cpu_mem": 3.416408064, + "gpu_mem": 1.12568832, + "loss": 0.4494, + "grad_norm": 3.3351638317108154, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 234, + "epoch": 0.375, + "cpu_mem": 3.416604672, + "gpu_mem": 1.125717504, + "loss": 0.5246, + "grad_norm": 3.7790186405181885, + "learning_rate": 0.0002363229335283915 + }, + { + "step": 235, + "epoch": 0.3766025641025641, + "cpu_mem": 3.41680128, + "gpu_mem": 1.125694464, + "loss": 0.5845, + "grad_norm": 4.257011413574219, + "learning_rate": 0.00023563462355364297 + }, + { + "step": 236, + "epoch": 0.3782051282051282, + "cpu_mem": 3.416997888, + "gpu_mem": 1.125755904, + "loss": 0.5327, + "grad_norm": 3.8204891681671143, + "learning_rate": 0.0002349436280966775 + }, + { + "step": 237, + "epoch": 0.3798076923076923, + "cpu_mem": 3.416997888, + "gpu_mem": 1.12568064, + "loss": 0.6016, + "grad_norm": 4.447617530822754, + "learning_rate": 0.00023424996882695468 + }, + { + "step": 238, + "epoch": 0.3814102564102564, + "cpu_mem": 3.416997888, + "gpu_mem": 1.125691392, + "loss": 0.6388, + "grad_norm": 4.3770952224731445, + "learning_rate": 0.00023355366749747063 + }, + { + "step": 239, + "epoch": 0.38301282051282054, + "cpu_mem": 3.417194496, + "gpu_mem": 1.125689856, + "loss": 0.7379, + "grad_norm": 4.119786262512207, + "learning_rate": 0.00023285474594407585 + }, + { + "step": 240, + "epoch": 0.38461538461538464, + "cpu_mem": 3.417391104, + "gpu_mem": 1.125686784, + "loss": 0.7178, + "grad_norm": 4.275717258453369, + "learning_rate": 0.0002321532260847905 + }, + { + "step": 241, + "epoch": 0.38621794871794873, + "cpu_mem": 3.417587712, + "gpu_mem": 1.125717504, + "loss": 0.521, + "grad_norm": 3.7095096111297607, + "learning_rate": 0.00023144912991911691 + }, + { + "step": 242, + "epoch": 0.38782051282051283, + "cpu_mem": 3.41778432, + "gpu_mem": 1.125696, + "loss": 0.5978, + "grad_norm": 3.7863268852233887, + "learning_rate": 0.0002307424795273499 + }, + { + "step": 243, + "epoch": 0.3894230769230769, + "cpu_mem": 3.417980928, + "gpu_mem": 1.125691392, + "loss": 0.565, + "grad_norm": 3.7503652572631836, + "learning_rate": 0.00023003329706988425 + }, + { + "step": 244, + "epoch": 0.391025641025641, + "cpu_mem": 3.418177536, + "gpu_mem": 1.125702144, + "loss": 0.5526, + "grad_norm": 3.1811983585357666, + "learning_rate": 0.00022932160478651963 + }, + { + "step": 245, + "epoch": 0.3926282051282051, + "cpu_mem": 3.418374144, + "gpu_mem": 1.125706752, + "loss": 0.6334, + "grad_norm": 3.5433290004730225, + "learning_rate": 0.00022860742499576338 + }, + { + "step": 246, + "epoch": 0.3942307692307692, + "cpu_mem": 3.418570752, + "gpu_mem": 1.125668352, + "loss": 0.7078, + "grad_norm": 4.0762248039245605, + "learning_rate": 0.00022789078009413042 + }, + { + "step": 247, + "epoch": 0.3958333333333333, + "cpu_mem": 3.418570752, + "gpu_mem": 1.125735936, + "loss": 0.7847, + "grad_norm": 3.5055885314941406, + "learning_rate": 0.00022717169255544108 + }, + { + "step": 248, + "epoch": 0.3974358974358974, + "cpu_mem": 3.41876736, + "gpu_mem": 1.125699072, + "loss": 0.5979, + "grad_norm": 2.7694742679595947, + "learning_rate": 0.00022645018493011612 + }, + { + "step": 249, + "epoch": 0.39903846153846156, + "cpu_mem": 3.418963968, + "gpu_mem": 1.12568832, + "loss": 0.5758, + "grad_norm": 2.9905989170074463, + "learning_rate": 0.0002257262798444698 + }, + { + "step": 250, + "epoch": 0.40064102564102566, + "cpu_mem": 3.419160576, + "gpu_mem": 1.125705216, + "loss": 0.6854, + "grad_norm": 2.962320327758789, + "learning_rate": 0.000225 + }, + { + "step": 251, + "epoch": 0.40224358974358976, + "cpu_mem": 3.419160576, + "gpu_mem": 1.125679104, + "loss": 0.5865, + "grad_norm": 3.670105218887329, + "learning_rate": 0.00022427136817267668 + }, + { + "step": 252, + "epoch": 0.40384615384615385, + "cpu_mem": 3.419357184, + "gpu_mem": 1.12572672, + "loss": 0.4606, + "grad_norm": 2.8604609966278076, + "learning_rate": 0.0002235404072122273 + }, + { + "step": 253, + "epoch": 0.40544871794871795, + "cpu_mem": 3.419553792, + "gpu_mem": 1.125694464, + "loss": 0.571, + "grad_norm": 3.9744739532470703, + "learning_rate": 0.00022280714004142054 + }, + { + "step": 254, + "epoch": 0.40705128205128205, + "cpu_mem": 3.4197504, + "gpu_mem": 1.125683712, + "loss": 0.4058, + "grad_norm": 2.7209420204162598, + "learning_rate": 0.00022207158965534726 + }, + { + "step": 255, + "epoch": 0.40865384615384615, + "cpu_mem": 3.419947008, + "gpu_mem": 1.125699072, + "loss": 0.6371, + "grad_norm": 3.957677125930786, + "learning_rate": 0.0002213337791206993 + }, + { + "step": 256, + "epoch": 0.41025641025641024, + "cpu_mem": 3.420143616, + "gpu_mem": 1.125696, + "loss": 0.6586, + "grad_norm": 3.9090576171875, + "learning_rate": 0.00022059373157504636 + }, + { + "step": 257, + "epoch": 0.41185897435897434, + "cpu_mem": 3.420143616, + "gpu_mem": 1.125696, + "loss": 0.4608, + "grad_norm": 3.3167080879211426, + "learning_rate": 0.00021985147022611038 + }, + { + "step": 258, + "epoch": 0.41346153846153844, + "cpu_mem": 3.420340224, + "gpu_mem": 1.125683712, + "loss": 0.5977, + "grad_norm": 4.301418781280518, + "learning_rate": 0.0002191070183510375 + }, + { + "step": 259, + "epoch": 0.4150641025641026, + "cpu_mem": 3.420536832, + "gpu_mem": 1.125666816, + "loss": 0.6873, + "grad_norm": 5.173335075378418, + "learning_rate": 0.00021836039929566835 + }, + { + "step": 260, + "epoch": 0.4166666666666667, + "cpu_mem": 3.42073344, + "gpu_mem": 1.125729792, + "loss": 0.6409, + "grad_norm": 3.653987169265747, + "learning_rate": 0.0002176116364738058 + }, + { + "step": 261, + "epoch": 0.4182692307692308, + "cpu_mem": 3.42073344, + "gpu_mem": 1.125683712, + "loss": 0.4706, + "grad_norm": 3.224968194961548, + "learning_rate": 0.00021686075336648075 + }, + { + "step": 262, + "epoch": 0.4198717948717949, + "cpu_mem": 3.420930048, + "gpu_mem": 1.125692928, + "loss": 0.5584, + "grad_norm": 3.966257333755493, + "learning_rate": 0.00021610777352121574 + }, + { + "step": 263, + "epoch": 0.421474358974359, + "cpu_mem": 3.420930048, + "gpu_mem": 1.125728256, + "loss": 0.4207, + "grad_norm": 2.654153347015381, + "learning_rate": 0.0002153527205512867 + }, + { + "step": 264, + "epoch": 0.4230769230769231, + "cpu_mem": 3.421126656, + "gpu_mem": 1.125692928, + "loss": 0.5423, + "grad_norm": 3.5723869800567627, + "learning_rate": 0.0002145956181349821 + }, + { + "step": 265, + "epoch": 0.42467948717948717, + "cpu_mem": 3.421323264, + "gpu_mem": 1.125697536, + "loss": 0.7708, + "grad_norm": 4.641718864440918, + "learning_rate": 0.00021383649001486055 + }, + { + "step": 266, + "epoch": 0.42628205128205127, + "cpu_mem": 3.421519872, + "gpu_mem": 1.125745152, + "loss": 0.4436, + "grad_norm": 3.37636661529541, + "learning_rate": 0.00021307535999700637 + }, + { + "step": 267, + "epoch": 0.42788461538461536, + "cpu_mem": 3.421519872, + "gpu_mem": 1.125754368, + "loss": 0.3989, + "grad_norm": 2.7013227939605713, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 268, + "epoch": 0.42948717948717946, + "cpu_mem": 3.42171648, + "gpu_mem": 1.125708288, + "loss": 0.4157, + "grad_norm": 3.206953287124634, + "learning_rate": 0.00021154718980558417 + }, + { + "step": 269, + "epoch": 0.4310897435897436, + "cpu_mem": 3.421913088, + "gpu_mem": 1.125702144, + "loss": 0.5465, + "grad_norm": 3.358818531036377, + "learning_rate": 0.00021078019755508398 + }, + { + "step": 270, + "epoch": 0.4326923076923077, + "cpu_mem": 3.422109696, + "gpu_mem": 1.125763584, + "loss": 0.5136, + "grad_norm": 3.297494649887085, + "learning_rate": 0.00021001129925148396 + }, + { + "step": 271, + "epoch": 0.4342948717948718, + "cpu_mem": 3.422109696, + "gpu_mem": 1.125689856, + "loss": 0.6008, + "grad_norm": 5.3274030685424805, + "learning_rate": 0.00020924051900725923 + }, + { + "step": 272, + "epoch": 0.4358974358974359, + "cpu_mem": 3.422306304, + "gpu_mem": 1.12568832, + "loss": 0.6151, + "grad_norm": 4.710865497589111, + "learning_rate": 0.00020846788099390188 + }, + { + "step": 273, + "epoch": 0.4375, + "cpu_mem": 3.422306304, + "gpu_mem": 1.125691392, + "loss": 0.4416, + "grad_norm": 3.3821945190429688, + "learning_rate": 0.0002076934094411635 + }, + { + "step": 274, + "epoch": 0.4391025641025641, + "cpu_mem": 3.422502912, + "gpu_mem": 1.125677568, + "loss": 0.538, + "grad_norm": 3.742550849914551, + "learning_rate": 0.0002069171286362949 + }, + { + "step": 275, + "epoch": 0.4407051282051282, + "cpu_mem": 3.42269952, + "gpu_mem": 1.125692928, + "loss": 0.3468, + "grad_norm": 2.5764448642730713, + "learning_rate": 0.00020613906292328457 + }, + { + "step": 276, + "epoch": 0.4423076923076923, + "cpu_mem": 3.422896128, + "gpu_mem": 1.125731328, + "loss": 0.4915, + "grad_norm": 3.459211587905884, + "learning_rate": 0.0002053592367020955 + }, + { + "step": 277, + "epoch": 0.4439102564102564, + "cpu_mem": 3.422896128, + "gpu_mem": 1.12571136, + "loss": 0.5839, + "grad_norm": 3.7547082901000977, + "learning_rate": 0.0002045776744278996 + }, + { + "step": 278, + "epoch": 0.44551282051282054, + "cpu_mem": 3.423092736, + "gpu_mem": 1.125737472, + "loss": 0.6201, + "grad_norm": 3.777604818344116, + "learning_rate": 0.00020379440061031118 + }, + { + "step": 279, + "epoch": 0.44711538461538464, + "cpu_mem": 3.423289344, + "gpu_mem": 1.12568832, + "loss": 0.4301, + "grad_norm": 3.1565380096435547, + "learning_rate": 0.00020300943981261808 + }, + { + "step": 280, + "epoch": 0.44871794871794873, + "cpu_mem": 3.423289344, + "gpu_mem": 1.125682176, + "loss": 0.6464, + "grad_norm": 3.555190324783325, + "learning_rate": 0.0002022228166510114 + }, + { + "step": 281, + "epoch": 0.45032051282051283, + "cpu_mem": 3.423289344, + "gpu_mem": 1.125705216, + "loss": 0.4066, + "grad_norm": 2.6024444103240967, + "learning_rate": 0.00020143455579381373 + }, + { + "step": 282, + "epoch": 0.4519230769230769, + "cpu_mem": 3.423485952, + "gpu_mem": 1.125683712, + "loss": 0.6056, + "grad_norm": 3.3291842937469482, + "learning_rate": 0.0002006446819607053 + }, + { + "step": 283, + "epoch": 0.453525641025641, + "cpu_mem": 3.423485952, + "gpu_mem": 1.125697536, + "loss": 0.645, + "grad_norm": 3.2095425128936768, + "learning_rate": 0.00019985321992194892 + }, + { + "step": 284, + "epoch": 0.4551282051282051, + "cpu_mem": 3.42368256, + "gpu_mem": 1.125702144, + "loss": 0.5548, + "grad_norm": 3.0105881690979004, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 285, + "epoch": 0.4567307692307692, + "cpu_mem": 3.423879168, + "gpu_mem": 1.125720576, + "loss": 0.5504, + "grad_norm": 3.129361152648926, + "learning_rate": 0.00019826563055679418 + }, + { + "step": 286, + "epoch": 0.4583333333333333, + "cpu_mem": 3.424075776, + "gpu_mem": 1.125691392, + "loss": 0.3969, + "grad_norm": 2.123267412185669, + "learning_rate": 0.00019746955301683537 + }, + { + "step": 287, + "epoch": 0.4599358974358974, + "cpu_mem": 3.424075776, + "gpu_mem": 1.12571904, + "loss": 0.7014, + "grad_norm": 3.1591475009918213, + "learning_rate": 0.0001966719868425464 + }, + { + "step": 288, + "epoch": 0.46153846153846156, + "cpu_mem": 3.424075776, + "gpu_mem": 1.125700608, + "loss": 0.5447, + "grad_norm": 2.8219265937805176, + "learning_rate": 0.0001958729570454201 + }, + { + "step": 289, + "epoch": 0.46314102564102566, + "cpu_mem": 3.424272384, + "gpu_mem": 1.12568832, + "loss": 0.5632, + "grad_norm": 2.893486499786377, + "learning_rate": 0.0001950724886828484 + }, + { + "step": 290, + "epoch": 0.46474358974358976, + "cpu_mem": 3.424468992, + "gpu_mem": 1.125697536, + "loss": 0.7281, + "grad_norm": 3.680816888809204, + "learning_rate": 0.000194270606857336 + }, + { + "step": 291, + "epoch": 0.46634615384615385, + "cpu_mem": 3.424468992, + "gpu_mem": 1.125694464, + "loss": 0.6824, + "grad_norm": 4.421926975250244, + "learning_rate": 0.00019346733671571367 + }, + { + "step": 292, + "epoch": 0.46794871794871795, + "cpu_mem": 3.4246656, + "gpu_mem": 1.125709824, + "loss": 0.5787, + "grad_norm": 3.581559181213379, + "learning_rate": 0.00019266270344834942 + }, + { + "step": 293, + "epoch": 0.46955128205128205, + "cpu_mem": 3.424862208, + "gpu_mem": 1.125717504, + "loss": 0.4937, + "grad_norm": 3.30686354637146, + "learning_rate": 0.00019185673228835857 + }, + { + "step": 294, + "epoch": 0.47115384615384615, + "cpu_mem": 3.425058816, + "gpu_mem": 1.125706752, + "loss": 0.5013, + "grad_norm": 3.1696276664733887, + "learning_rate": 0.00019104944851081244 + }, + { + "step": 295, + "epoch": 0.47275641025641024, + "cpu_mem": 3.425058816, + "gpu_mem": 1.125691392, + "loss": 0.8399, + "grad_norm": 3.9744772911071777, + "learning_rate": 0.00019024087743194564 + }, + { + "step": 296, + "epoch": 0.47435897435897434, + "cpu_mem": 3.425255424, + "gpu_mem": 1.125694464, + "loss": 0.5364, + "grad_norm": 3.4160468578338623, + "learning_rate": 0.0001894310444083625 + }, + { + "step": 297, + "epoch": 0.47596153846153844, + "cpu_mem": 3.425255424, + "gpu_mem": 1.12568832, + "loss": 0.4818, + "grad_norm": 3.322261095046997, + "learning_rate": 0.00018861997483624136 + }, + { + "step": 298, + "epoch": 0.4775641025641026, + "cpu_mem": 3.425452032, + "gpu_mem": 1.125683712, + "loss": 0.8351, + "grad_norm": 5.046484470367432, + "learning_rate": 0.00018780769415053866 + }, + { + "step": 299, + "epoch": 0.4791666666666667, + "cpu_mem": 3.42564864, + "gpu_mem": 1.125705216, + "loss": 0.7859, + "grad_norm": 3.8405885696411133, + "learning_rate": 0.00018699422782419094 + }, + { + "step": 300, + "epoch": 0.4807692307692308, + "cpu_mem": 3.42564864, + "gpu_mem": 1.125697536, + "loss": 0.608, + "grad_norm": 2.7156646251678467, + "learning_rate": 0.00018617960136731624 + }, + { + "step": 301, + "epoch": 0.4823717948717949, + "cpu_mem": 3.42564864, + "gpu_mem": 1.125669888, + "loss": 0.6761, + "grad_norm": 3.700874090194702, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 302, + "epoch": 0.483974358974359, + "cpu_mem": 3.425845248, + "gpu_mem": 1.125668352, + "loss": 0.523, + "grad_norm": 2.2063801288604736, + "learning_rate": 0.0001845469702835641 + }, + { + "step": 303, + "epoch": 0.4855769230769231, + "cpu_mem": 3.425845248, + "gpu_mem": 1.125694464, + "loss": 0.6779, + "grad_norm": 3.259058952331543, + "learning_rate": 0.00018372901685562414 + }, + { + "step": 304, + "epoch": 0.48717948717948717, + "cpu_mem": 3.426041856, + "gpu_mem": 1.125677568, + "loss": 0.3947, + "grad_norm": 2.7725465297698975, + "learning_rate": 0.00018291000569342676 + }, + { + "step": 305, + "epoch": 0.48878205128205127, + "cpu_mem": 3.426041856, + "gpu_mem": 1.125708288, + "loss": 0.4915, + "grad_norm": 3.6511547565460205, + "learning_rate": 0.00018208996248097458 + }, + { + "step": 306, + "epoch": 0.49038461538461536, + "cpu_mem": 3.426238464, + "gpu_mem": 1.125691392, + "loss": 0.5373, + "grad_norm": 2.905935525894165, + "learning_rate": 0.00018126891293463547 + }, + { + "step": 307, + "epoch": 0.49198717948717946, + "cpu_mem": 3.426435072, + "gpu_mem": 1.125722112, + "loss": 0.5385, + "grad_norm": 2.824286699295044, + "learning_rate": 0.0001804468828023354 + }, + { + "step": 308, + "epoch": 0.4935897435897436, + "cpu_mem": 3.426435072, + "gpu_mem": 1.125689856, + "loss": 0.5762, + "grad_norm": 4.253853797912598, + "learning_rate": 0.00017962389786275142 + }, + { + "step": 309, + "epoch": 0.4951923076923077, + "cpu_mem": 3.42663168, + "gpu_mem": 1.125715968, + "loss": 0.4774, + "grad_norm": 3.5711584091186523, + "learning_rate": 0.000178799983924503 + }, + { + "step": 310, + "epoch": 0.4967948717948718, + "cpu_mem": 3.42663168, + "gpu_mem": 1.125691392, + "loss": 0.484, + "grad_norm": 3.0693750381469727, + "learning_rate": 0.00017797516682534293 + }, + { + "step": 311, + "epoch": 0.4983974358974359, + "cpu_mem": 3.426828288, + "gpu_mem": 1.125686784, + "loss": 0.5653, + "grad_norm": 3.6720778942108154, + "learning_rate": 0.00017714947243134695 + }, + { + "step": 312, + "epoch": 0.5, + "cpu_mem": 3.426828288, + "gpu_mem": 1.125689856, + "loss": 0.443, + "grad_norm": 4.150754451751709, + "learning_rate": 0.0001763229266361024 + }, + { + "step": 313, + "epoch": 0.5016025641025641, + "cpu_mem": 3.426828288, + "gpu_mem": 1.125708288, + "loss": 0.5382, + "grad_norm": 4.3089799880981445, + "learning_rate": 0.00017549555535989648 + }, + { + "step": 314, + "epoch": 0.5032051282051282, + "cpu_mem": 3.426828288, + "gpu_mem": 1.12568832, + "loss": 0.5546, + "grad_norm": 3.6592512130737305, + "learning_rate": 0.00017466738454890323 + }, + { + "step": 315, + "epoch": 0.5048076923076923, + "cpu_mem": 3.427024896, + "gpu_mem": 1.125692928, + "loss": 0.6509, + "grad_norm": 3.9803860187530518, + "learning_rate": 0.00017383844017436996 + }, + { + "step": 316, + "epoch": 0.5064102564102564, + "cpu_mem": 3.427024896, + "gpu_mem": 1.12568832, + "loss": 0.5242, + "grad_norm": 3.55664324760437, + "learning_rate": 0.00017300874823180282 + }, + { + "step": 317, + "epoch": 0.5080128205128205, + "cpu_mem": 3.427024896, + "gpu_mem": 1.125696, + "loss": 0.2714, + "grad_norm": 1.900181531906128, + "learning_rate": 0.00017217833474015128 + }, + { + "step": 318, + "epoch": 0.5096153846153846, + "cpu_mem": 3.427221504, + "gpu_mem": 1.125720576, + "loss": 0.5299, + "grad_norm": 2.8436625003814697, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 319, + "epoch": 0.5112179487179487, + "cpu_mem": 3.427418112, + "gpu_mem": 1.125712896, + "loss": 0.5438, + "grad_norm": 3.584642171859741, + "learning_rate": 0.0001705154472977154 + }, + { + "step": 320, + "epoch": 0.5128205128205128, + "cpu_mem": 3.42761472, + "gpu_mem": 1.125714432, + "loss": 0.5783, + "grad_norm": 3.361196279525757, + "learning_rate": 0.00016968302549470095 + }, + { + "step": 321, + "epoch": 0.5144230769230769, + "cpu_mem": 3.42761472, + "gpu_mem": 1.125689856, + "loss": 0.5029, + "grad_norm": 2.8327834606170654, + "learning_rate": 0.00016884998643650694 + }, + { + "step": 322, + "epoch": 0.5160256410256411, + "cpu_mem": 3.42761472, + "gpu_mem": 1.125691392, + "loss": 0.6411, + "grad_norm": 4.00896692276001, + "learning_rate": 0.00016801635624704776 + }, + { + "step": 323, + "epoch": 0.5176282051282052, + "cpu_mem": 3.427811328, + "gpu_mem": 1.12571136, + "loss": 0.6693, + "grad_norm": 3.2585608959198, + "learning_rate": 0.0001671821610687756 + }, + { + "step": 324, + "epoch": 0.5192307692307693, + "cpu_mem": 3.428007936, + "gpu_mem": 1.125683712, + "loss": 0.6125, + "grad_norm": 2.8783085346221924, + "learning_rate": 0.00016634742706186036 + }, + { + "step": 325, + "epoch": 0.5208333333333334, + "cpu_mem": 3.428007936, + "gpu_mem": 1.125696, + "loss": 0.3889, + "grad_norm": 2.5767877101898193, + "learning_rate": 0.00016551218040336993 + }, + { + "step": 326, + "epoch": 0.5224358974358975, + "cpu_mem": 3.428007936, + "gpu_mem": 1.125705216, + "loss": 0.4703, + "grad_norm": 3.0291733741760254, + "learning_rate": 0.00016467644728644843 + }, + { + "step": 327, + "epoch": 0.5240384615384616, + "cpu_mem": 3.428204544, + "gpu_mem": 1.125682176, + "loss": 0.5054, + "grad_norm": 3.3668699264526367, + "learning_rate": 0.0001638402539194953 + }, + { + "step": 328, + "epoch": 0.5256410256410257, + "cpu_mem": 3.428401152, + "gpu_mem": 1.125706752, + "loss": 0.6325, + "grad_norm": 3.53958797454834, + "learning_rate": 0.00016300362652534346 + }, + { + "step": 329, + "epoch": 0.5272435897435898, + "cpu_mem": 3.428401152, + "gpu_mem": 1.125706752, + "loss": 0.4747, + "grad_norm": 3.146620988845825, + "learning_rate": 0.00016216659134043657 + }, + { + "step": 330, + "epoch": 0.5288461538461539, + "cpu_mem": 3.428401152, + "gpu_mem": 1.125689856, + "loss": 0.416, + "grad_norm": 3.047092914581299, + "learning_rate": 0.00016132917461400686 + }, + { + "step": 331, + "epoch": 0.530448717948718, + "cpu_mem": 3.42859776, + "gpu_mem": 1.125686784, + "loss": 0.4528, + "grad_norm": 2.880415439605713, + "learning_rate": 0.00016049140260725127 + }, + { + "step": 332, + "epoch": 0.532051282051282, + "cpu_mem": 3.42859776, + "gpu_mem": 1.125679104, + "loss": 0.5252, + "grad_norm": 3.168412446975708, + "learning_rate": 0.00015965330159250845 + }, + { + "step": 333, + "epoch": 0.5336538461538461, + "cpu_mem": 3.42859776, + "gpu_mem": 1.125717504, + "loss": 0.6136, + "grad_norm": 4.405975341796875, + "learning_rate": 0.00015881489785243467 + }, + { + "step": 334, + "epoch": 0.5352564102564102, + "cpu_mem": 3.428794368, + "gpu_mem": 1.125694464, + "loss": 0.8641, + "grad_norm": 4.821969509124756, + "learning_rate": 0.00015797621767917942 + }, + { + "step": 335, + "epoch": 0.5368589743589743, + "cpu_mem": 3.428794368, + "gpu_mem": 1.125692928, + "loss": 0.628, + "grad_norm": 4.00813102722168, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 336, + "epoch": 0.5384615384615384, + "cpu_mem": 3.428794368, + "gpu_mem": 1.125709824, + "loss": 0.5723, + "grad_norm": 3.67295241355896, + "learning_rate": 0.00015629813324424292 + }, + { + "step": 337, + "epoch": 0.5400641025641025, + "cpu_mem": 3.428794368, + "gpu_mem": 1.125694464, + "loss": 0.5271, + "grad_norm": 3.613194704055786, + "learning_rate": 0.00015545878160690583 + }, + { + "step": 338, + "epoch": 0.5416666666666666, + "cpu_mem": 3.428990976, + "gpu_mem": 1.125706752, + "loss": 0.5792, + "grad_norm": 3.474271774291992, + "learning_rate": 0.00015461925878342556 + }, + { + "step": 339, + "epoch": 0.5432692307692307, + "cpu_mem": 3.428990976, + "gpu_mem": 1.12571904, + "loss": 0.4548, + "grad_norm": 2.4802868366241455, + "learning_rate": 0.00015377959110104584 + }, + { + "step": 340, + "epoch": 0.5448717948717948, + "cpu_mem": 3.429187584, + "gpu_mem": 1.125694464, + "loss": 0.4976, + "grad_norm": 2.7709505558013916, + "learning_rate": 0.00015293980489155333 + }, + { + "step": 341, + "epoch": 0.5464743589743589, + "cpu_mem": 3.429187584, + "gpu_mem": 1.125739008, + "loss": 0.5414, + "grad_norm": 2.818671941757202, + "learning_rate": 0.00015209992649045152 + }, + { + "step": 342, + "epoch": 0.5480769230769231, + "cpu_mem": 3.429187584, + "gpu_mem": 1.125712896, + "loss": 0.4927, + "grad_norm": 3.102410316467285, + "learning_rate": 0.000151259982236135 + }, + { + "step": 343, + "epoch": 0.5496794871794872, + "cpu_mem": 3.429384192, + "gpu_mem": 1.125709824, + "loss": 0.5124, + "grad_norm": 2.466782331466675, + "learning_rate": 0.00015041999846906367 + }, + { + "step": 344, + "epoch": 0.5512820512820513, + "cpu_mem": 3.429384192, + "gpu_mem": 1.125691392, + "loss": 0.3657, + "grad_norm": 2.034623384475708, + "learning_rate": 0.00014958000153093634 + }, + { + "step": 345, + "epoch": 0.5528846153846154, + "cpu_mem": 3.429384192, + "gpu_mem": 1.125697536, + "loss": 0.3586, + "grad_norm": 2.572171926498413, + "learning_rate": 0.000148740017763865 + }, + { + "step": 346, + "epoch": 0.5544871794871795, + "cpu_mem": 3.4295808, + "gpu_mem": 1.125666816, + "loss": 0.4675, + "grad_norm": 2.61598801612854, + "learning_rate": 0.00014790007350954845 + }, + { + "step": 347, + "epoch": 0.5560897435897436, + "cpu_mem": 3.429777408, + "gpu_mem": 1.125731328, + "loss": 0.6597, + "grad_norm": 3.1974668502807617, + "learning_rate": 0.00014706019510844664 + }, + { + "step": 348, + "epoch": 0.5576923076923077, + "cpu_mem": 3.429777408, + "gpu_mem": 1.125685248, + "loss": 0.4857, + "grad_norm": 3.384884834289551, + "learning_rate": 0.0001462204088989541 + }, + { + "step": 349, + "epoch": 0.5592948717948718, + "cpu_mem": 3.429777408, + "gpu_mem": 1.125679104, + "loss": 0.4392, + "grad_norm": 3.0120527744293213, + "learning_rate": 0.00014538074121657447 + }, + { + "step": 350, + "epoch": 0.5608974358974359, + "cpu_mem": 3.429974016, + "gpu_mem": 1.1257344, + "loss": 0.2721, + "grad_norm": 2.4898297786712646, + "learning_rate": 0.00014454121839309415 + }, + { + "step": 351, + "epoch": 0.5625, + "cpu_mem": 3.429974016, + "gpu_mem": 1.125700608, + "loss": 0.3568, + "grad_norm": 3.4663116931915283, + "learning_rate": 0.00014370186675575705 + }, + { + "step": 352, + "epoch": 0.5641025641025641, + "cpu_mem": 3.429974016, + "gpu_mem": 1.12568832, + "loss": 0.5751, + "grad_norm": 4.9622392654418945, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 353, + "epoch": 0.5657051282051282, + "cpu_mem": 3.429974016, + "gpu_mem": 1.125692928, + "loss": 0.538, + "grad_norm": 4.399668216705322, + "learning_rate": 0.00014202378232082053 + }, + { + "step": 354, + "epoch": 0.5673076923076923, + "cpu_mem": 3.430170624, + "gpu_mem": 1.12567296, + "loss": 0.4678, + "grad_norm": 3.8899691104888916, + "learning_rate": 0.00014118510214756536 + }, + { + "step": 355, + "epoch": 0.5689102564102564, + "cpu_mem": 3.430170624, + "gpu_mem": 1.125697536, + "loss": 0.5514, + "grad_norm": 4.638918876647949, + "learning_rate": 0.00014034669840749152 + }, + { + "step": 356, + "epoch": 0.5705128205128205, + "cpu_mem": 3.430367232, + "gpu_mem": 1.125676032, + "loss": 0.3537, + "grad_norm": 3.7556209564208984, + "learning_rate": 0.0001395085973927487 + }, + { + "step": 357, + "epoch": 0.5721153846153846, + "cpu_mem": 3.430367232, + "gpu_mem": 1.125692928, + "loss": 0.3936, + "grad_norm": 4.138609886169434, + "learning_rate": 0.00013867082538599317 + }, + { + "step": 358, + "epoch": 0.5737179487179487, + "cpu_mem": 3.430367232, + "gpu_mem": 1.1256576, + "loss": 0.6889, + "grad_norm": 6.602727890014648, + "learning_rate": 0.00013783340865956338 + }, + { + "step": 359, + "epoch": 0.5753205128205128, + "cpu_mem": 3.43056384, + "gpu_mem": 1.125689856, + "loss": 0.5589, + "grad_norm": 5.067461013793945, + "learning_rate": 0.0001369963734746566 + }, + { + "step": 360, + "epoch": 0.5769230769230769, + "cpu_mem": 3.430760448, + "gpu_mem": 1.125679104, + "loss": 0.3545, + "grad_norm": 3.3387680053710938, + "learning_rate": 0.0001361597460805047 + }, + { + "step": 361, + "epoch": 0.5785256410256411, + "cpu_mem": 3.430760448, + "gpu_mem": 1.125715968, + "loss": 0.4932, + "grad_norm": 4.20853328704834, + "learning_rate": 0.00013532355271355154 + }, + { + "step": 362, + "epoch": 0.5801282051282052, + "cpu_mem": 3.430760448, + "gpu_mem": 1.125682176, + "loss": 0.5309, + "grad_norm": 3.6700191497802734, + "learning_rate": 0.00013448781959663004 + }, + { + "step": 363, + "epoch": 0.5817307692307693, + "cpu_mem": 3.430760448, + "gpu_mem": 1.125705216, + "loss": 0.6391, + "grad_norm": 4.082846164703369, + "learning_rate": 0.00013365257293813956 + }, + { + "step": 364, + "epoch": 0.5833333333333334, + "cpu_mem": 3.430957056, + "gpu_mem": 1.125694464, + "loss": 0.5974, + "grad_norm": 3.7821357250213623, + "learning_rate": 0.00013281783893122446 + }, + { + "step": 365, + "epoch": 0.5849358974358975, + "cpu_mem": 3.430957056, + "gpu_mem": 1.125700608, + "loss": 0.4713, + "grad_norm": 3.544257879257202, + "learning_rate": 0.00013198364375295224 + }, + { + "step": 366, + "epoch": 0.5865384615384616, + "cpu_mem": 3.430957056, + "gpu_mem": 1.125694464, + "loss": 0.5737, + "grad_norm": 3.1483054161071777, + "learning_rate": 0.000131150013563493 + }, + { + "step": 367, + "epoch": 0.5881410256410257, + "cpu_mem": 3.431153664, + "gpu_mem": 1.125712896, + "loss": 0.6517, + "grad_norm": 2.9787662029266357, + "learning_rate": 0.00013031697450529902 + }, + { + "step": 368, + "epoch": 0.5897435897435898, + "cpu_mem": 3.431153664, + "gpu_mem": 1.12567296, + "loss": 0.4043, + "grad_norm": 2.3765928745269775, + "learning_rate": 0.0001294845527022846 + }, + { + "step": 369, + "epoch": 0.5913461538461539, + "cpu_mem": 3.431153664, + "gpu_mem": 1.125705216, + "loss": 0.7644, + "grad_norm": 2.7378060817718506, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 370, + "epoch": 0.592948717948718, + "cpu_mem": 3.431153664, + "gpu_mem": 1.125725184, + "loss": 0.5551, + "grad_norm": 2.1341941356658936, + "learning_rate": 0.0001278216652598487 + }, + { + "step": 371, + "epoch": 0.594551282051282, + "cpu_mem": 3.431153664, + "gpu_mem": 1.12571904, + "loss": 0.4931, + "grad_norm": 2.8134407997131348, + "learning_rate": 0.00012699125176819716 + }, + { + "step": 372, + "epoch": 0.5961538461538461, + "cpu_mem": 3.431350272, + "gpu_mem": 1.125682176, + "loss": 0.574, + "grad_norm": 3.04988431930542, + "learning_rate": 0.00012616155982563 + }, + { + "step": 373, + "epoch": 0.5977564102564102, + "cpu_mem": 3.431350272, + "gpu_mem": 1.125699072, + "loss": 0.3803, + "grad_norm": 2.747776508331299, + "learning_rate": 0.00012533261545109674 + }, + { + "step": 374, + "epoch": 0.5993589743589743, + "cpu_mem": 3.43154688, + "gpu_mem": 1.125676032, + "loss": 0.6865, + "grad_norm": 2.640749931335449, + "learning_rate": 0.00012450444464010352 + }, + { + "step": 375, + "epoch": 0.6009615384615384, + "cpu_mem": 3.43154688, + "gpu_mem": 1.125708288, + "loss": 0.4949, + "grad_norm": 3.2296416759490967, + "learning_rate": 0.0001236770733638976 + }, + { + "step": 376, + "epoch": 0.6025641025641025, + "cpu_mem": 3.43154688, + "gpu_mem": 1.12570368, + "loss": 0.4535, + "grad_norm": 2.912365436553955, + "learning_rate": 0.000122850527568653 + }, + { + "step": 377, + "epoch": 0.6041666666666666, + "cpu_mem": 3.431743488, + "gpu_mem": 1.125712896, + "loss": 0.3808, + "grad_norm": 2.4258999824523926, + "learning_rate": 0.00012202483317465704 + }, + { + "step": 378, + "epoch": 0.6057692307692307, + "cpu_mem": 3.431743488, + "gpu_mem": 1.125686784, + "loss": 0.4231, + "grad_norm": 2.351055860519409, + "learning_rate": 0.00012120001607549698 + }, + { + "step": 379, + "epoch": 0.6073717948717948, + "cpu_mem": 3.431743488, + "gpu_mem": 1.125706752, + "loss": 0.4822, + "grad_norm": 2.7122762203216553, + "learning_rate": 0.00012037610213724862 + }, + { + "step": 380, + "epoch": 0.6089743589743589, + "cpu_mem": 3.431743488, + "gpu_mem": 1.12568064, + "loss": 0.5699, + "grad_norm": 3.4061405658721924, + "learning_rate": 0.0001195531171976646 + }, + { + "step": 381, + "epoch": 0.6105769230769231, + "cpu_mem": 3.431743488, + "gpu_mem": 1.125705216, + "loss": 0.4829, + "grad_norm": 3.0852696895599365, + "learning_rate": 0.00011873108706536448 + }, + { + "step": 382, + "epoch": 0.6121794871794872, + "cpu_mem": 3.431940096, + "gpu_mem": 1.125689856, + "loss": 0.4429, + "grad_norm": 3.5664069652557373, + "learning_rate": 0.00011791003751902542 + }, + { + "step": 383, + "epoch": 0.6137820512820513, + "cpu_mem": 3.431940096, + "gpu_mem": 1.125723648, + "loss": 0.5715, + "grad_norm": 3.0119705200195312, + "learning_rate": 0.00011708999430657325 + }, + { + "step": 384, + "epoch": 0.6153846153846154, + "cpu_mem": 3.432136704, + "gpu_mem": 1.12570368, + "loss": 0.4064, + "grad_norm": 3.4676060676574707, + "learning_rate": 0.00011627098314437586 + }, + { + "step": 385, + "epoch": 0.6169871794871795, + "cpu_mem": 3.432136704, + "gpu_mem": 1.12568832, + "loss": 0.4987, + "grad_norm": 3.355682849884033, + "learning_rate": 0.0001154530297164359 + }, + { + "step": 386, + "epoch": 0.6185897435897436, + "cpu_mem": 3.432136704, + "gpu_mem": 1.125723648, + "loss": 0.4755, + "grad_norm": 3.722646474838257, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 387, + "epoch": 0.6201923076923077, + "cpu_mem": 3.432136704, + "gpu_mem": 1.125729792, + "loss": 0.3228, + "grad_norm": 2.548408031463623, + "learning_rate": 0.00011382039863268374 + }, + { + "step": 388, + "epoch": 0.6217948717948718, + "cpu_mem": 3.432136704, + "gpu_mem": 1.125692928, + "loss": 0.5611, + "grad_norm": 4.5473480224609375, + "learning_rate": 0.00011300577217580905 + }, + { + "step": 389, + "epoch": 0.6233974358974359, + "cpu_mem": 3.432136704, + "gpu_mem": 1.125671424, + "loss": 0.4867, + "grad_norm": 3.335951805114746, + "learning_rate": 0.00011219230584946136 + }, + { + "step": 390, + "epoch": 0.625, + "cpu_mem": 3.432333312, + "gpu_mem": 1.125723648, + "loss": 0.69, + "grad_norm": 4.234187602996826, + "learning_rate": 0.00011138002516375864 + }, + { + "step": 391, + "epoch": 0.6266025641025641, + "cpu_mem": 3.432333312, + "gpu_mem": 1.125709824, + "loss": 0.3382, + "grad_norm": 2.554506301879883, + "learning_rate": 0.00011056895559163748 + }, + { + "step": 392, + "epoch": 0.6282051282051282, + "cpu_mem": 3.432333312, + "gpu_mem": 1.12570368, + "loss": 0.4878, + "grad_norm": 2.9870965480804443, + "learning_rate": 0.00010975912256805436 + }, + { + "step": 393, + "epoch": 0.6298076923076923, + "cpu_mem": 3.432333312, + "gpu_mem": 1.125709824, + "loss": 0.5987, + "grad_norm": 3.238006353378296, + "learning_rate": 0.00010895055148918756 + }, + { + "step": 394, + "epoch": 0.6314102564102564, + "cpu_mem": 3.432333312, + "gpu_mem": 1.125686784, + "loss": 0.6723, + "grad_norm": 3.7677509784698486, + "learning_rate": 0.00010814326771164141 + }, + { + "step": 395, + "epoch": 0.6330128205128205, + "cpu_mem": 3.432333312, + "gpu_mem": 1.125700608, + "loss": 0.2919, + "grad_norm": 2.3828697204589844, + "learning_rate": 0.00010733729655165054 + }, + { + "step": 396, + "epoch": 0.6346153846153846, + "cpu_mem": 3.43252992, + "gpu_mem": 1.125700608, + "loss": 0.7205, + "grad_norm": 4.670462131500244, + "learning_rate": 0.00010653266328428628 + }, + { + "step": 397, + "epoch": 0.6362179487179487, + "cpu_mem": 3.43252992, + "gpu_mem": 1.125669888, + "loss": 0.3688, + "grad_norm": 2.5733377933502197, + "learning_rate": 0.00010572939314266402 + }, + { + "step": 398, + "epoch": 0.6378205128205128, + "cpu_mem": 3.43252992, + "gpu_mem": 1.12570368, + "loss": 0.5469, + "grad_norm": 3.295950174331665, + "learning_rate": 0.00010492751131715159 + }, + { + "step": 399, + "epoch": 0.6394230769230769, + "cpu_mem": 3.43252992, + "gpu_mem": 1.125682176, + "loss": 0.4915, + "grad_norm": 3.0758450031280518, + "learning_rate": 0.00010412704295457988 + }, + { + "step": 400, + "epoch": 0.6410256410256411, + "cpu_mem": 3.432726528, + "gpu_mem": 1.125689856, + "loss": 0.4139, + "grad_norm": 3.1469602584838867, + "learning_rate": 0.00010332801315745361 + }, + { + "step": 401, + "epoch": 0.6426282051282052, + "cpu_mem": 3.432726528, + "gpu_mem": 1.125708288, + "loss": 0.4856, + "grad_norm": 3.0009608268737793, + "learning_rate": 0.00010253044698316464 + }, + { + "step": 402, + "epoch": 0.6442307692307693, + "cpu_mem": 3.432923136, + "gpu_mem": 1.125676032, + "loss": 0.6123, + "grad_norm": 3.682239532470703, + "learning_rate": 0.00010173436944320582 + }, + { + "step": 403, + "epoch": 0.6458333333333334, + "cpu_mem": 3.432923136, + "gpu_mem": 1.12568064, + "loss": 0.3911, + "grad_norm": 2.3040530681610107, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 404, + "epoch": 0.6474358974358975, + "cpu_mem": 3.432923136, + "gpu_mem": 1.125676032, + "loss": 0.3544, + "grad_norm": 2.0100793838500977, + "learning_rate": 0.00010014678007805106 + }, + { + "step": 405, + "epoch": 0.6490384615384616, + "cpu_mem": 3.432923136, + "gpu_mem": 1.125720576, + "loss": 0.5854, + "grad_norm": 3.6193881034851074, + "learning_rate": 9.935531803929469e-05 + }, + { + "step": 406, + "epoch": 0.6506410256410257, + "cpu_mem": 3.433119744, + "gpu_mem": 1.12570368, + "loss": 0.3668, + "grad_norm": 2.44498348236084, + "learning_rate": 9.856544420618624e-05 + }, + { + "step": 407, + "epoch": 0.6522435897435898, + "cpu_mem": 3.433119744, + "gpu_mem": 1.125692928, + "loss": 0.4778, + "grad_norm": 2.841466188430786, + "learning_rate": 9.777718334898859e-05 + }, + { + "step": 408, + "epoch": 0.6538461538461539, + "cpu_mem": 3.433119744, + "gpu_mem": 1.125714432, + "loss": 0.4099, + "grad_norm": 2.8391273021698, + "learning_rate": 9.699056018738192e-05 + }, + { + "step": 409, + "epoch": 0.655448717948718, + "cpu_mem": 3.433119744, + "gpu_mem": 1.12568064, + "loss": 0.4729, + "grad_norm": 3.057194232940674, + "learning_rate": 9.62055993896888e-05 + }, + { + "step": 410, + "epoch": 0.657051282051282, + "cpu_mem": 3.433119744, + "gpu_mem": 1.125696, + "loss": 0.383, + "grad_norm": 2.733968734741211, + "learning_rate": 9.542232557210039e-05 + }, + { + "step": 411, + "epoch": 0.6586538461538461, + "cpu_mem": 3.433316352, + "gpu_mem": 1.125696, + "loss": 0.4587, + "grad_norm": 2.8765406608581543, + "learning_rate": 9.464076329790451e-05 + }, + { + "step": 412, + "epoch": 0.6602564102564102, + "cpu_mem": 3.433316352, + "gpu_mem": 1.125686784, + "loss": 0.4435, + "grad_norm": 3.4765307903289795, + "learning_rate": 9.386093707671543e-05 + }, + { + "step": 413, + "epoch": 0.6618589743589743, + "cpu_mem": 3.433316352, + "gpu_mem": 1.125697536, + "loss": 0.7061, + "grad_norm": 3.6621525287628174, + "learning_rate": 9.308287136370511e-05 + }, + { + "step": 414, + "epoch": 0.6634615384615384, + "cpu_mem": 3.433316352, + "gpu_mem": 1.125722112, + "loss": 0.4846, + "grad_norm": 3.240004301071167, + "learning_rate": 9.230659055883649e-05 + }, + { + "step": 415, + "epoch": 0.6650641025641025, + "cpu_mem": 3.43351296, + "gpu_mem": 1.125674496, + "loss": 0.5987, + "grad_norm": 3.470036506652832, + "learning_rate": 9.15321190060981e-05 + }, + { + "step": 416, + "epoch": 0.6666666666666666, + "cpu_mem": 3.43351296, + "gpu_mem": 1.125709824, + "loss": 0.4633, + "grad_norm": 3.5036020278930664, + "learning_rate": 9.075948099274078e-05 + }, + { + "step": 417, + "epoch": 0.6682692307692307, + "cpu_mem": 3.43351296, + "gpu_mem": 1.125671424, + "loss": 0.3602, + "grad_norm": 2.723820924758911, + "learning_rate": 8.998870074851604e-05 + }, + { + "step": 418, + "epoch": 0.6698717948717948, + "cpu_mem": 3.43351296, + "gpu_mem": 1.125689856, + "loss": 0.4332, + "grad_norm": 2.9448118209838867, + "learning_rate": 8.9219802444916e-05 + }, + { + "step": 419, + "epoch": 0.6714743589743589, + "cpu_mem": 3.43351296, + "gpu_mem": 1.125682176, + "loss": 0.4328, + "grad_norm": 2.9031031131744385, + "learning_rate": 8.845281019441583e-05 + }, + { + "step": 420, + "epoch": 0.6730769230769231, + "cpu_mem": 3.43351296, + "gpu_mem": 1.12571904, + "loss": 0.4261, + "grad_norm": 2.7792513370513916, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 421, + "epoch": 0.6746794871794872, + "cpu_mem": 3.43351296, + "gpu_mem": 1.125679104, + "loss": 0.3933, + "grad_norm": 3.261742115020752, + "learning_rate": 8.692464000299362e-05 + }, + { + "step": 422, + "epoch": 0.6762820512820513, + "cpu_mem": 3.43351296, + "gpu_mem": 1.125692928, + "loss": 0.3507, + "grad_norm": 2.3528950214385986, + "learning_rate": 8.61635099851395e-05 + }, + { + "step": 423, + "epoch": 0.6778846153846154, + "cpu_mem": 3.433709568, + "gpu_mem": 1.125697536, + "loss": 0.3084, + "grad_norm": 2.5022027492523193, + "learning_rate": 8.540438186501792e-05 + }, + { + "step": 424, + "epoch": 0.6794871794871795, + "cpu_mem": 3.433906176, + "gpu_mem": 1.125659136, + "loss": 0.5041, + "grad_norm": 3.6584157943725586, + "learning_rate": 8.464727944871322e-05 + }, + { + "step": 425, + "epoch": 0.6810897435897436, + "cpu_mem": 3.433906176, + "gpu_mem": 1.125682176, + "loss": 0.6386, + "grad_norm": 3.5420913696289062, + "learning_rate": 8.389222647878426e-05 + }, + { + "step": 426, + "epoch": 0.6826923076923077, + "cpu_mem": 3.433906176, + "gpu_mem": 1.12568064, + "loss": 0.4354, + "grad_norm": 3.030334711074829, + "learning_rate": 8.313924663351926e-05 + }, + { + "step": 427, + "epoch": 0.6842948717948718, + "cpu_mem": 3.433906176, + "gpu_mem": 1.125699072, + "loss": 0.5326, + "grad_norm": 3.4918313026428223, + "learning_rate": 8.238836352619424e-05 + }, + { + "step": 428, + "epoch": 0.6858974358974359, + "cpu_mem": 3.433906176, + "gpu_mem": 1.125696, + "loss": 0.5389, + "grad_norm": 3.7121403217315674, + "learning_rate": 8.163960070433164e-05 + }, + { + "step": 429, + "epoch": 0.6875, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125694464, + "loss": 0.5232, + "grad_norm": 3.0608017444610596, + "learning_rate": 8.089298164896245e-05 + }, + { + "step": 430, + "epoch": 0.6891025641025641, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125712896, + "loss": 0.308, + "grad_norm": 2.6720163822174072, + "learning_rate": 8.014852977388964e-05 + }, + { + "step": 431, + "epoch": 0.6907051282051282, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125674496, + "loss": 0.5039, + "grad_norm": 3.5080089569091797, + "learning_rate": 7.940626842495362e-05 + }, + { + "step": 432, + "epoch": 0.6923076923076923, + "cpu_mem": 3.434102784, + "gpu_mem": 1.12571904, + "loss": 0.3114, + "grad_norm": 2.749934673309326, + "learning_rate": 7.866622087930074e-05 + }, + { + "step": 433, + "epoch": 0.6939102564102564, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125683712, + "loss": 0.3593, + "grad_norm": 2.5356295108795166, + "learning_rate": 7.792841034465275e-05 + }, + { + "step": 434, + "epoch": 0.6955128205128205, + "cpu_mem": 3.434102784, + "gpu_mem": 1.12571136, + "loss": 0.4947, + "grad_norm": 3.3213722705841064, + "learning_rate": 7.719285995857938e-05 + }, + { + "step": 435, + "epoch": 0.6971153846153846, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125691392, + "loss": 0.3908, + "grad_norm": 3.3147993087768555, + "learning_rate": 7.64595927877727e-05 + }, + { + "step": 436, + "epoch": 0.6987179487179487, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125737472, + "loss": 0.6583, + "grad_norm": 3.7401819229125977, + "learning_rate": 7.572863182732332e-05 + }, + { + "step": 437, + "epoch": 0.7003205128205128, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125702144, + "loss": 0.4417, + "grad_norm": 2.7934937477111816, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 438, + "epoch": 0.7019230769230769, + "cpu_mem": 3.434102784, + "gpu_mem": 1.125692928, + "loss": 0.344, + "grad_norm": 2.8811941146850586, + "learning_rate": 7.42737201555302e-05 + }, + { + "step": 439, + "epoch": 0.7035256410256411, + "cpu_mem": 3.434299392, + "gpu_mem": 1.125686784, + "loss": 0.3063, + "grad_norm": 2.8746438026428223, + "learning_rate": 7.354981506988387e-05 + }, + { + "step": 440, + "epoch": 0.7051282051282052, + "cpu_mem": 3.434496, + "gpu_mem": 1.125671424, + "loss": 0.4635, + "grad_norm": 3.054126024246216, + "learning_rate": 7.282830744455895e-05 + }, + { + "step": 441, + "epoch": 0.7067307692307693, + "cpu_mem": 3.434496, + "gpu_mem": 1.125689856, + "loss": 0.6795, + "grad_norm": 4.431227684020996, + "learning_rate": 7.210921990586957e-05 + }, + { + "step": 442, + "epoch": 0.7083333333333334, + "cpu_mem": 3.434496, + "gpu_mem": 1.125691392, + "loss": 0.3121, + "grad_norm": 2.509927749633789, + "learning_rate": 7.139257500423665e-05 + }, + { + "step": 443, + "epoch": 0.7099358974358975, + "cpu_mem": 3.434496, + "gpu_mem": 1.125696, + "loss": 0.5858, + "grad_norm": 4.0217061042785645, + "learning_rate": 7.067839521348035e-05 + }, + { + "step": 444, + "epoch": 0.7115384615384616, + "cpu_mem": 3.434496, + "gpu_mem": 1.125699072, + "loss": 0.3705, + "grad_norm": 3.5972747802734375, + "learning_rate": 6.996670293011575e-05 + }, + { + "step": 445, + "epoch": 0.7131410256410257, + "cpu_mem": 3.434496, + "gpu_mem": 1.125692928, + "loss": 0.5928, + "grad_norm": 4.082733154296875, + "learning_rate": 6.92575204726501e-05 + }, + { + "step": 446, + "epoch": 0.7147435897435898, + "cpu_mem": 3.434496, + "gpu_mem": 1.12571904, + "loss": 0.6639, + "grad_norm": 3.6822690963745117, + "learning_rate": 6.855087008088307e-05 + }, + { + "step": 447, + "epoch": 0.7163461538461539, + "cpu_mem": 3.434496, + "gpu_mem": 1.125686784, + "loss": 0.3487, + "grad_norm": 2.3745572566986084, + "learning_rate": 6.784677391520952e-05 + }, + { + "step": 448, + "epoch": 0.717948717948718, + "cpu_mem": 3.434496, + "gpu_mem": 1.125714432, + "loss": 0.3937, + "grad_norm": 3.163935899734497, + "learning_rate": 6.714525405592412e-05 + }, + { + "step": 449, + "epoch": 0.719551282051282, + "cpu_mem": 3.434496, + "gpu_mem": 1.125722112, + "loss": 0.3889, + "grad_norm": 3.477097272872925, + "learning_rate": 6.644633250252937e-05 + }, + { + "step": 450, + "epoch": 0.7211538461538461, + "cpu_mem": 3.434496, + "gpu_mem": 1.12570368, + "loss": 0.5284, + "grad_norm": 3.663787603378296, + "learning_rate": 6.575003117304535e-05 + }, + { + "step": 451, + "epoch": 0.7227564102564102, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125689856, + "loss": 0.3645, + "grad_norm": 3.001763105392456, + "learning_rate": 6.50563719033225e-05 + }, + { + "step": 452, + "epoch": 0.7243589743589743, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125700608, + "loss": 0.346, + "grad_norm": 3.1899254322052, + "learning_rate": 6.436537644635705e-05 + }, + { + "step": 453, + "epoch": 0.7259615384615384, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125692928, + "loss": 0.4272, + "grad_norm": 3.288916826248169, + "learning_rate": 6.367706647160847e-05 + }, + { + "step": 454, + "epoch": 0.7275641025641025, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125709824, + "loss": 0.528, + "grad_norm": 3.2244999408721924, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 455, + "epoch": 0.7291666666666666, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125682176, + "loss": 0.3958, + "grad_norm": 3.36372971534729, + "learning_rate": 6.230858922484288e-05 + }, + { + "step": 456, + "epoch": 0.7307692307692307, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125712896, + "loss": 0.4209, + "grad_norm": 3.109013557434082, + "learning_rate": 6.162846486795938e-05 + }, + { + "step": 457, + "epoch": 0.7323717948717948, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125694464, + "loss": 0.5235, + "grad_norm": 4.11654806137085, + "learning_rate": 6.095111182221422e-05 + }, + { + "step": 458, + "epoch": 0.7339743589743589, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125682176, + "loss": 0.5565, + "grad_norm": 3.8538734912872314, + "learning_rate": 6.027655132924397e-05 + }, + { + "step": 459, + "epoch": 0.7355769230769231, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125694464, + "loss": 0.4819, + "grad_norm": 3.507513999938965, + "learning_rate": 5.960480454311155e-05 + }, + { + "step": 460, + "epoch": 0.7371794871794872, + "cpu_mem": 3.434692608, + "gpu_mem": 1.125700608, + "loss": 0.3204, + "grad_norm": 3.585418701171875, + "learning_rate": 5.893589252964258e-05 + }, + { + "step": 461, + "epoch": 0.7387820512820513, + "cpu_mem": 3.434889216, + "gpu_mem": 1.12568832, + "loss": 0.2549, + "grad_norm": 2.3041892051696777, + "learning_rate": 5.826983626576479e-05 + }, + { + "step": 462, + "epoch": 0.7403846153846154, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125677568, + "loss": 0.3526, + "grad_norm": 3.342909336090088, + "learning_rate": 5.760665663885046e-05 + }, + { + "step": 463, + "epoch": 0.7419871794871795, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125679104, + "loss": 0.3408, + "grad_norm": 3.1206064224243164, + "learning_rate": 5.6946374446060984e-05 + }, + { + "step": 464, + "epoch": 0.7435897435897436, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125692928, + "loss": 0.4247, + "grad_norm": 3.162538766860962, + "learning_rate": 5.6289010393695056e-05 + }, + { + "step": 465, + "epoch": 0.7451923076923077, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125696, + "loss": 0.4525, + "grad_norm": 3.420485258102417, + "learning_rate": 5.563458509653904e-05 + }, + { + "step": 466, + "epoch": 0.7467948717948718, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125706752, + "loss": 0.3827, + "grad_norm": 3.1574013233184814, + "learning_rate": 5.498311907722057e-05 + }, + { + "step": 467, + "epoch": 0.7483974358974359, + "cpu_mem": 3.434889216, + "gpu_mem": 1.12568064, + "loss": 0.4569, + "grad_norm": 3.088571310043335, + "learning_rate": 5.43346327655652e-05 + }, + { + "step": 468, + "epoch": 0.75, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125696, + "loss": 0.7221, + "grad_norm": 4.554949760437012, + "learning_rate": 5.3689146497955274e-05 + }, + { + "step": 469, + "epoch": 0.7516025641025641, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125705216, + "loss": 0.5177, + "grad_norm": 3.6076836585998535, + "learning_rate": 5.30466805166927e-05 + }, + { + "step": 470, + "epoch": 0.7532051282051282, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125679104, + "loss": 0.5178, + "grad_norm": 3.717750310897827, + "learning_rate": 5.240725496936372e-05 + }, + { + "step": 471, + "epoch": 0.7548076923076923, + "cpu_mem": 3.434889216, + "gpu_mem": 1.125685248, + "loss": 0.4732, + "grad_norm": 4.051069259643555, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 472, + "epoch": 0.7564102564102564, + "cpu_mem": 3.435085824, + "gpu_mem": 1.125674496, + "loss": 0.4929, + "grad_norm": 3.3390235900878906, + "learning_rate": 5.113760528948622e-05 + }, + { + "step": 473, + "epoch": 0.7580128205128205, + "cpu_mem": 3.435085824, + "gpu_mem": 1.12568064, + "loss": 0.3949, + "grad_norm": 3.409930944442749, + "learning_rate": 5.05074209728614e-05 + }, + { + "step": 474, + "epoch": 0.7596153846153846, + "cpu_mem": 3.435085824, + "gpu_mem": 1.125717504, + "loss": 0.2872, + "grad_norm": 2.7701528072357178, + "learning_rate": 4.988035672076899e-05 + }, + { + "step": 475, + "epoch": 0.7612179487179487, + "cpu_mem": 3.435085824, + "gpu_mem": 1.12566528, + "loss": 0.6542, + "grad_norm": 3.8038365840911865, + "learning_rate": 4.925643219780052e-05 + }, + { + "step": 476, + "epoch": 0.7628205128205128, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125685248, + "loss": 0.4621, + "grad_norm": 2.9146790504455566, + "learning_rate": 4.863566697008634e-05 + }, + { + "step": 477, + "epoch": 0.7644230769230769, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125685248, + "loss": 0.4863, + "grad_norm": 3.9681684970855713, + "learning_rate": 4.801808050468219e-05 + }, + { + "step": 478, + "epoch": 0.7660256410256411, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125683712, + "loss": 0.4318, + "grad_norm": 2.6785478591918945, + "learning_rate": 4.7403692168958305e-05 + }, + { + "step": 479, + "epoch": 0.7676282051282052, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125682176, + "loss": 0.2986, + "grad_norm": 2.299663543701172, + "learning_rate": 4.679252122999255e-05 + }, + { + "step": 480, + "epoch": 0.7692307692307693, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125674496, + "loss": 0.6308, + "grad_norm": 3.9106292724609375, + "learning_rate": 4.618458685396579e-05 + }, + { + "step": 481, + "epoch": 0.7708333333333334, + "cpu_mem": 3.435282432, + "gpu_mem": 1.1257344, + "loss": 0.3453, + "grad_norm": 2.4187371730804443, + "learning_rate": 4.5579908105561016e-05 + }, + { + "step": 482, + "epoch": 0.7724358974358975, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125679104, + "loss": 0.3843, + "grad_norm": 2.483978509902954, + "learning_rate": 4.497850394736563e-05 + }, + { + "step": 483, + "epoch": 0.7740384615384616, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125662208, + "loss": 0.5331, + "grad_norm": 2.9886417388916016, + "learning_rate": 4.438039323927648e-05 + }, + { + "step": 484, + "epoch": 0.7756410256410257, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125692928, + "loss": 0.5529, + "grad_norm": 3.1834754943847656, + "learning_rate": 4.3785594737908676e-05 + }, + { + "step": 485, + "epoch": 0.7772435897435898, + "cpu_mem": 3.435282432, + "gpu_mem": 1.125737472, + "loss": 0.4535, + "grad_norm": 2.611889123916626, + "learning_rate": 4.319412709600723e-05 + }, + { + "step": 486, + "epoch": 0.7788461538461539, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125717504, + "loss": 0.3225, + "grad_norm": 2.2773995399475098, + "learning_rate": 4.2606008861862116e-05 + }, + { + "step": 487, + "epoch": 0.780448717948718, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125717504, + "loss": 0.3501, + "grad_norm": 2.6417276859283447, + "learning_rate": 4.2021258478726774e-05 + }, + { + "step": 488, + "epoch": 0.782051282051282, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125683712, + "loss": 0.4265, + "grad_norm": 3.0402333736419678, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 489, + "epoch": 0.7836538461538461, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125708288, + "loss": 0.4062, + "grad_norm": 2.4023475646972656, + "learning_rate": 4.0861934509848507e-05 + }, + { + "step": 490, + "epoch": 0.7852564102564102, + "cpu_mem": 3.43547904, + "gpu_mem": 1.12571136, + "loss": 0.5812, + "grad_norm": 3.2064714431762695, + "learning_rate": 4.028739728024022e-05 + }, + { + "step": 491, + "epoch": 0.7868589743589743, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125689856, + "loss": 0.3395, + "grad_norm": 2.620352029800415, + "learning_rate": 3.971630061277077e-05 + }, + { + "step": 492, + "epoch": 0.7884615384615384, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125712896, + "loss": 0.3411, + "grad_norm": 3.426124095916748, + "learning_rate": 3.914866241690115e-05 + }, + { + "step": 493, + "epoch": 0.7900641025641025, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125692928, + "loss": 0.4624, + "grad_norm": 3.5180723667144775, + "learning_rate": 3.858450049363532e-05 + }, + { + "step": 494, + "epoch": 0.7916666666666666, + "cpu_mem": 3.43547904, + "gpu_mem": 1.125717504, + "loss": 0.4145, + "grad_norm": 2.6674182415008545, + "learning_rate": 3.8023832534962314e-05 + }, + { + "step": 495, + "epoch": 0.7932692307692307, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125700608, + "loss": 0.464, + "grad_norm": 4.058709621429443, + "learning_rate": 3.746667612330109e-05 + }, + { + "step": 496, + "epoch": 0.7948717948717948, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125696, + "loss": 0.4228, + "grad_norm": 3.3853323459625244, + "learning_rate": 3.691304873094927e-05 + }, + { + "step": 497, + "epoch": 0.7964743589743589, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125708288, + "loss": 0.4505, + "grad_norm": 2.768657684326172, + "learning_rate": 3.636296771953544e-05 + }, + { + "step": 498, + "epoch": 0.7980769230769231, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125679104, + "loss": 0.3236, + "grad_norm": 2.655945062637329, + "learning_rate": 3.581645033947425e-05 + }, + { + "step": 499, + "epoch": 0.7996794871794872, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125692928, + "loss": 0.5627, + "grad_norm": 3.4534294605255127, + "learning_rate": 3.527351372942588e-05 + }, + { + "step": 500, + "epoch": 0.8012820512820513, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125679104, + "loss": 0.3927, + "grad_norm": 3.047316074371338, + "learning_rate": 3.473417491575824e-05 + }, + { + "step": 501, + "epoch": 0.8028846153846154, + "cpu_mem": 3.435675648, + "gpu_mem": 1.12567296, + "loss": 0.4632, + "grad_norm": 3.575718402862549, + "learning_rate": 3.41984508120132e-05 + }, + { + "step": 502, + "epoch": 0.8044871794871795, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125679104, + "loss": 0.272, + "grad_norm": 2.5262696743011475, + "learning_rate": 3.366635821837627e-05 + }, + { + "step": 503, + "epoch": 0.8060897435897436, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125692928, + "loss": 0.4728, + "grad_norm": 3.633176565170288, + "learning_rate": 3.3137913821149425e-05 + }, + { + "step": 504, + "epoch": 0.8076923076923077, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125676032, + "loss": 0.6493, + "grad_norm": 3.9570565223693848, + "learning_rate": 3.261313419222825e-05 + }, + { + "step": 505, + "epoch": 0.8092948717948718, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125729792, + "loss": 0.3487, + "grad_norm": 2.635680675506592, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 506, + "epoch": 0.8108974358974359, + "cpu_mem": 3.435675648, + "gpu_mem": 1.12567296, + "loss": 0.3156, + "grad_norm": 2.6680004596710205, + "learning_rate": 3.157463495173713e-05 + }, + { + "step": 507, + "epoch": 0.8125, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125751296, + "loss": 0.407, + "grad_norm": 3.123056411743164, + "learning_rate": 3.1060947907265936e-05 + }, + { + "step": 508, + "epoch": 0.8141025641025641, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125694464, + "loss": 0.3754, + "grad_norm": 3.015554904937744, + "learning_rate": 3.0550990764276634e-05 + }, + { + "step": 509, + "epoch": 0.8157051282051282, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125712896, + "loss": 0.4707, + "grad_norm": 3.446807622909546, + "learning_rate": 3.00447795149086e-05 + }, + { + "step": 510, + "epoch": 0.8173076923076923, + "cpu_mem": 3.435675648, + "gpu_mem": 1.12568832, + "loss": 0.3461, + "grad_norm": 2.370466947555542, + "learning_rate": 2.9542330033830884e-05 + }, + { + "step": 511, + "epoch": 0.8189102564102564, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125720576, + "loss": 0.4277, + "grad_norm": 3.248053550720215, + "learning_rate": 2.9043658077744316e-05 + }, + { + "step": 512, + "epoch": 0.8205128205128205, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125740544, + "loss": 0.446, + "grad_norm": 3.588775873184204, + "learning_rate": 2.8548779284887442e-05 + }, + { + "step": 513, + "epoch": 0.8221153846153846, + "cpu_mem": 3.435675648, + "gpu_mem": 1.125669888, + "loss": 0.3079, + "grad_norm": 2.886380672454834, + "learning_rate": 2.805770917454614e-05 + }, + { + "step": 514, + "epoch": 0.8237179487179487, + "cpu_mem": 3.435872256, + "gpu_mem": 1.125683712, + "loss": 0.2946, + "grad_norm": 2.548650026321411, + "learning_rate": 2.7570463146566758e-05 + }, + { + "step": 515, + "epoch": 0.8253205128205128, + "cpu_mem": 3.435872256, + "gpu_mem": 1.125668352, + "loss": 0.4016, + "grad_norm": 3.4394733905792236, + "learning_rate": 2.708705648087332e-05 + }, + { + "step": 516, + "epoch": 0.8269230769230769, + "cpu_mem": 3.435872256, + "gpu_mem": 1.125706752, + "loss": 0.3233, + "grad_norm": 2.3200416564941406, + "learning_rate": 2.6607504336988317e-05 + }, + { + "step": 517, + "epoch": 0.8285256410256411, + "cpu_mem": 3.435872256, + "gpu_mem": 1.125706752, + "loss": 0.5268, + "grad_norm": 3.4312796592712402, + "learning_rate": 2.613182175355739e-05 + }, + { + "step": 518, + "epoch": 0.8301282051282052, + "cpu_mem": 3.435872256, + "gpu_mem": 1.125692928, + "loss": 0.3617, + "grad_norm": 2.75594162940979, + "learning_rate": 2.5660023647877644e-05 + }, + { + "step": 519, + "epoch": 0.8317307692307693, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125683712, + "loss": 0.4342, + "grad_norm": 3.2963154315948486, + "learning_rate": 2.5192124815429777e-05 + }, + { + "step": 520, + "epoch": 0.8333333333333334, + "cpu_mem": 3.436068864, + "gpu_mem": 1.12568832, + "loss": 0.5514, + "grad_norm": 3.1944496631622314, + "learning_rate": 2.472813992941418e-05 + }, + { + "step": 521, + "epoch": 0.8349358974358975, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125691392, + "loss": 0.4448, + "grad_norm": 3.2383601665496826, + "learning_rate": 2.426808354029078e-05 + }, + { + "step": 522, + "epoch": 0.8365384615384616, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125697536, + "loss": 0.3416, + "grad_norm": 2.468079090118408, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 523, + "epoch": 0.8381410256410257, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125715968, + "loss": 0.4436, + "grad_norm": 3.3646321296691895, + "learning_rate": 2.3359813838124277e-05 + }, + { + "step": 524, + "epoch": 0.8397435897435898, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125709824, + "loss": 0.5306, + "grad_norm": 4.2399396896362305, + "learning_rate": 2.2911629008211363e-05 + }, + { + "step": 525, + "epoch": 0.8413461538461539, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125686784, + "loss": 0.3034, + "grad_norm": 3.209428548812866, + "learning_rate": 2.24674296405579e-05 + }, + { + "step": 526, + "epoch": 0.842948717948718, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125674496, + "loss": 0.5031, + "grad_norm": 3.311450719833374, + "learning_rate": 2.2027229665154446e-05 + }, + { + "step": 527, + "epoch": 0.844551282051282, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125640704, + "loss": 0.4197, + "grad_norm": 3.1421849727630615, + "learning_rate": 2.1591042886571634e-05 + }, + { + "step": 528, + "epoch": 0.8461538461538461, + "cpu_mem": 3.436068864, + "gpu_mem": 1.12568832, + "loss": 0.4431, + "grad_norm": 3.4550814628601074, + "learning_rate": 2.1158882983527166e-05 + }, + { + "step": 529, + "epoch": 0.8477564102564102, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125654528, + "loss": 0.5711, + "grad_norm": 3.6850788593292236, + "learning_rate": 2.0730763508456738e-05 + }, + { + "step": 530, + "epoch": 0.8493589743589743, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125702144, + "loss": 0.4228, + "grad_norm": 2.9446446895599365, + "learning_rate": 2.0306697887089235e-05 + }, + { + "step": 531, + "epoch": 0.8509615384615384, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125700608, + "loss": 0.5564, + "grad_norm": 3.105963945388794, + "learning_rate": 1.9886699418025543e-05 + }, + { + "step": 532, + "epoch": 0.8525641025641025, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125702144, + "loss": 0.2924, + "grad_norm": 2.423595666885376, + "learning_rate": 1.947078127232169e-05 + }, + { + "step": 533, + "epoch": 0.8541666666666666, + "cpu_mem": 3.436068864, + "gpu_mem": 1.12571136, + "loss": 0.5635, + "grad_norm": 3.6205947399139404, + "learning_rate": 1.9058956493075644e-05 + }, + { + "step": 534, + "epoch": 0.8557692307692307, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125686784, + "loss": 0.5011, + "grad_norm": 3.3587160110473633, + "learning_rate": 1.8651237995018324e-05 + }, + { + "step": 535, + "epoch": 0.8573717948717948, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125671424, + "loss": 0.5894, + "grad_norm": 4.460448265075684, + "learning_rate": 1.8247638564108607e-05 + }, + { + "step": 536, + "epoch": 0.8589743589743589, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125700608, + "loss": 0.4236, + "grad_norm": 3.3500468730926514, + "learning_rate": 1.7848170857132325e-05 + }, + { + "step": 537, + "epoch": 0.8605769230769231, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125714432, + "loss": 0.4457, + "grad_norm": 3.1605377197265625, + "learning_rate": 1.7452847401305496e-05 + }, + { + "step": 538, + "epoch": 0.8621794871794872, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125669888, + "loss": 0.3829, + "grad_norm": 2.718906879425049, + "learning_rate": 1.7061680593881344e-05 + }, + { + "step": 539, + "epoch": 0.8637820512820513, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125676032, + "loss": 0.4975, + "grad_norm": 2.8285715579986572, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 540, + "epoch": 0.8653846153846154, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125705216, + "loss": 0.4958, + "grad_norm": 3.4516522884368896, + "learning_rate": 1.6291865861111353e-05 + }, + { + "step": 541, + "epoch": 0.8669871794871795, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125700608, + "loss": 0.3125, + "grad_norm": 2.276359796524048, + "learning_rate": 1.5913242076979493e-05 + }, + { + "step": 542, + "epoch": 0.8685897435897436, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125686784, + "loss": 0.6037, + "grad_norm": 3.239652633666992, + "learning_rate": 1.5538823222921288e-05 + }, + { + "step": 543, + "epoch": 0.8701923076923077, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125700608, + "loss": 0.4216, + "grad_norm": 3.2845616340637207, + "learning_rate": 1.5168621040626388e-05 + }, + { + "step": 544, + "epoch": 0.8717948717948718, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125689856, + "loss": 0.4747, + "grad_norm": 3.447798728942871, + "learning_rate": 1.4802647139550577e-05 + }, + { + "step": 545, + "epoch": 0.8733974358974359, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125696, + "loss": 0.2788, + "grad_norm": 1.923667550086975, + "learning_rate": 1.444091299655175e-05 + }, + { + "step": 546, + "epoch": 0.875, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125700608, + "loss": 0.5325, + "grad_norm": 3.6387834548950195, + "learning_rate": 1.408342995552988e-05 + }, + { + "step": 547, + "epoch": 0.8766025641025641, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125696, + "loss": 0.3953, + "grad_norm": 2.853590488433838, + "learning_rate": 1.3730209227071436e-05 + }, + { + "step": 548, + "epoch": 0.8782051282051282, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125669888, + "loss": 0.3789, + "grad_norm": 2.575258731842041, + "learning_rate": 1.3381261888097755e-05 + }, + { + "step": 549, + "epoch": 0.8798076923076923, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125679104, + "loss": 0.4627, + "grad_norm": 3.212299108505249, + "learning_rate": 1.303659888151753e-05 + }, + { + "step": 550, + "epoch": 0.8814102564102564, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125697536, + "loss": 0.6525, + "grad_norm": 3.6093909740448, + "learning_rate": 1.2696231015883913e-05 + }, + { + "step": 551, + "epoch": 0.8830128205128205, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125668352, + "loss": 0.413, + "grad_norm": 2.9108450412750244, + "learning_rate": 1.2360168965055301e-05 + }, + { + "step": 552, + "epoch": 0.8846153846153846, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125699072, + "loss": 0.4465, + "grad_norm": 3.2429556846618652, + "learning_rate": 1.2028423267860805e-05 + }, + { + "step": 553, + "epoch": 0.8862179487179487, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125708288, + "loss": 0.4295, + "grad_norm": 2.6926209926605225, + "learning_rate": 1.1701004327769709e-05 + }, + { + "step": 554, + "epoch": 0.8878205128205128, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125669888, + "loss": 0.5871, + "grad_norm": 3.3183770179748535, + "learning_rate": 1.1377922412565005e-05 + }, + { + "step": 555, + "epoch": 0.8894230769230769, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125674496, + "loss": 0.3287, + "grad_norm": 2.737887144088745, + "learning_rate": 1.1059187654021762e-05 + }, + { + "step": 556, + "epoch": 0.8910256410256411, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125699072, + "loss": 0.5231, + "grad_norm": 3.1976583003997803, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 557, + "epoch": 0.8926282051282052, + "cpu_mem": 3.436068864, + "gpu_mem": 1.125717504, + "loss": 0.3256, + "grad_norm": 2.3534042835235596, + "learning_rate": 1.0434799452076915e-05 + }, + { + "step": 558, + "epoch": 0.8942307692307693, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125699072, + "loss": 0.6033, + "grad_norm": 3.4700512886047363, + "learning_rate": 1.0129165589346643e-05 + }, + { + "step": 559, + "epoch": 0.8958333333333334, + "cpu_mem": 3.436265472, + "gpu_mem": 1.12574976, + "loss": 0.5171, + "grad_norm": 3.389880895614624, + "learning_rate": 9.82791804400626e-06 + }, + { + "step": 560, + "epoch": 0.8974358974358975, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125682176, + "loss": 0.9499, + "grad_norm": 4.648243427276611, + "learning_rate": 9.531066263109971e-06 + }, + { + "step": 561, + "epoch": 0.8990384615384616, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125683712, + "loss": 0.3844, + "grad_norm": 3.1306264400482178, + "learning_rate": 9.238619555861731e-06 + }, + { + "step": 562, + "epoch": 0.9006410256410257, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125683712, + "loss": 0.4406, + "grad_norm": 2.845165967941284, + "learning_rate": 8.950587093323435e-06 + }, + { + "step": 563, + "epoch": 0.9022435897435898, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125689856, + "loss": 0.3476, + "grad_norm": 2.3906469345092773, + "learning_rate": 8.66697790812731e-06 + }, + { + "step": 564, + "epoch": 0.9038461538461539, + "cpu_mem": 3.436265472, + "gpu_mem": 1.12570368, + "loss": 0.2953, + "grad_norm": 2.2151641845703125, + "learning_rate": 8.387800894192453e-06 + }, + { + "step": 565, + "epoch": 0.905448717948718, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125708288, + "loss": 0.5554, + "grad_norm": 3.5496556758880615, + "learning_rate": 8.113064806446285e-06 + }, + { + "step": 566, + "epoch": 0.907051282051282, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125702144, + "loss": 0.4052, + "grad_norm": 2.867069721221924, + "learning_rate": 7.842778260549654e-06 + }, + { + "step": 567, + "epoch": 0.9086538461538461, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125696, + "loss": 0.4328, + "grad_norm": 3.4153189659118652, + "learning_rate": 7.5769497326268804e-06 + }, + { + "step": 568, + "epoch": 0.9102564102564102, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125709824, + "loss": 0.5482, + "grad_norm": 3.5759832859039307, + "learning_rate": 7.315587558999864e-06 + }, + { + "step": 569, + "epoch": 0.9118589743589743, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125702144, + "loss": 0.5316, + "grad_norm": 3.6609292030334473, + "learning_rate": 7.058699935926526e-06 + }, + { + "step": 570, + "epoch": 0.9134615384615384, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125686784, + "loss": 0.4475, + "grad_norm": 2.9967925548553467, + "learning_rate": 6.8062949193440515e-06 + }, + { + "step": 571, + "epoch": 0.9150641025641025, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125696, + "loss": 0.5461, + "grad_norm": 3.8819174766540527, + "learning_rate": 6.5583804246160385e-06 + }, + { + "step": 572, + "epoch": 0.9166666666666666, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125705216, + "loss": 0.4011, + "grad_norm": 3.0852878093719482, + "learning_rate": 6.3149642262843804e-06 + }, + { + "step": 573, + "epoch": 0.9182692307692307, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125708288, + "loss": 0.3893, + "grad_norm": 2.6983256340026855, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 574, + "epoch": 0.9198717948717948, + "cpu_mem": 3.436265472, + "gpu_mem": 1.125662208, + "loss": 0.4667, + "grad_norm": 2.9461522102355957, + "learning_rate": 5.84165711141048e-06 + }, + { + "step": 575, + "epoch": 0.9214743589743589, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125715968, + "loss": 0.241, + "grad_norm": 2.4695017337799072, + "learning_rate": 5.611781037671176e-06 + }, + { + "step": 576, + "epoch": 0.9230769230769231, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125717504, + "loss": 0.4254, + "grad_norm": 2.533904552459717, + "learning_rate": 5.386432945468555e-06 + }, + { + "step": 577, + "epoch": 0.9246794871794872, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125662208, + "loss": 0.4976, + "grad_norm": 2.7814748287200928, + "learning_rate": 5.165619901667311e-06 + }, + { + "step": 578, + "epoch": 0.9262820512820513, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125696, + "loss": 0.4881, + "grad_norm": 2.8667736053466797, + "learning_rate": 4.949348830914002e-06 + }, + { + "step": 579, + "epoch": 0.9278846153846154, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125674496, + "loss": 0.4955, + "grad_norm": 3.0442259311676025, + "learning_rate": 4.737626515419951e-06 + }, + { + "step": 580, + "epoch": 0.9294871794871795, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125705216, + "loss": 0.4042, + "grad_norm": 2.583869457244873, + "learning_rate": 4.530459594748592e-06 + }, + { + "step": 581, + "epoch": 0.9310897435897436, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12568064, + "loss": 0.3052, + "grad_norm": 2.85621976852417, + "learning_rate": 4.327854565607164e-06 + }, + { + "step": 582, + "epoch": 0.9326923076923077, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125714432, + "loss": 0.3998, + "grad_norm": 2.8520593643188477, + "learning_rate": 4.129817781643091e-06 + }, + { + "step": 583, + "epoch": 0.9342948717948718, + "cpu_mem": 3.43646208, + "gpu_mem": 1.1257344, + "loss": 0.666, + "grad_norm": 3.872570037841797, + "learning_rate": 3.9363554532446276e-06 + }, + { + "step": 584, + "epoch": 0.9358974358974359, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125699072, + "loss": 0.3129, + "grad_norm": 2.4948909282684326, + "learning_rate": 3.7474736473461607e-06 + }, + { + "step": 585, + "epoch": 0.9375, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12571904, + "loss": 0.3541, + "grad_norm": 2.438199043273926, + "learning_rate": 3.56317828723795e-06 + }, + { + "step": 586, + "epoch": 0.9391025641025641, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125699072, + "loss": 0.3888, + "grad_norm": 2.7770633697509766, + "learning_rate": 3.383475152380355e-06 + }, + { + "step": 587, + "epoch": 0.9407051282051282, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125700608, + "loss": 0.4588, + "grad_norm": 3.132174491882324, + "learning_rate": 3.2083698782225997e-06 + }, + { + "step": 588, + "epoch": 0.9423076923076923, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125692928, + "loss": 0.3114, + "grad_norm": 2.520106554031372, + "learning_rate": 3.0378679560260467e-06 + }, + { + "step": 589, + "epoch": 0.9439102564102564, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125691392, + "loss": 0.4143, + "grad_norm": 2.5350656509399414, + "learning_rate": 2.871974732691984e-06 + }, + { + "step": 590, + "epoch": 0.9455128205128205, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125705216, + "loss": 0.4892, + "grad_norm": 3.54239559173584, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 591, + "epoch": 0.9471153846153846, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125676032, + "loss": 0.4208, + "grad_norm": 2.592811346054077, + "learning_rate": 2.554035047414732e-06 + }, + { + "step": 592, + "epoch": 0.9487179487179487, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12572672, + "loss": 0.4124, + "grad_norm": 3.059171199798584, + "learning_rate": 2.401998555987389e-06 + }, + { + "step": 593, + "epoch": 0.9503205128205128, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125723648, + "loss": 0.4433, + "grad_norm": 3.3725225925445557, + "learning_rate": 2.2545907041415457e-06 + }, + { + "step": 594, + "epoch": 0.9519230769230769, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12570368, + "loss": 0.429, + "grad_norm": 3.3244833946228027, + "learning_rate": 2.1118161145537436e-06 + }, + { + "step": 595, + "epoch": 0.9535256410256411, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125685248, + "loss": 0.3601, + "grad_norm": 2.135354995727539, + "learning_rate": 1.973679264602485e-06 + }, + { + "step": 596, + "epoch": 0.9551282051282052, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125694464, + "loss": 0.3308, + "grad_norm": 2.423692464828491, + "learning_rate": 1.840184486227808e-06 + }, + { + "step": 597, + "epoch": 0.9567307692307693, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125662208, + "loss": 0.5534, + "grad_norm": 4.079288005828857, + "learning_rate": 1.711335965795435e-06 + }, + { + "step": 598, + "epoch": 0.9583333333333334, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125723648, + "loss": 0.5536, + "grad_norm": 3.3304054737091064, + "learning_rate": 1.5871377439655054e-06 + }, + { + "step": 599, + "epoch": 0.9599358974358975, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125722112, + "loss": 0.3789, + "grad_norm": 3.0864293575286865, + "learning_rate": 1.4675937155658456e-06 + }, + { + "step": 600, + "epoch": 0.9615384615384616, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125677568, + "loss": 0.4841, + "grad_norm": 2.9184353351593018, + "learning_rate": 1.3527076294698846e-06 + }, + { + "step": 601, + "epoch": 0.9631410256410257, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125709824, + "loss": 0.4078, + "grad_norm": 2.8991098403930664, + "learning_rate": 1.2424830884790126e-06 + }, + { + "step": 602, + "epoch": 0.9647435897435898, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12570368, + "loss": 0.522, + "grad_norm": 3.528278112411499, + "learning_rate": 1.1369235492096397e-06 + }, + { + "step": 603, + "epoch": 0.9663461538461539, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125689856, + "loss": 0.3271, + "grad_norm": 2.319345712661743, + "learning_rate": 1.0360323219847645e-06 + }, + { + "step": 604, + "epoch": 0.967948717948718, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125689856, + "loss": 0.5538, + "grad_norm": 3.820770740509033, + "learning_rate": 9.398125707302084e-07 + }, + { + "step": 605, + "epoch": 0.969551282051282, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125715968, + "loss": 0.3367, + "grad_norm": 2.7328784465789795, + "learning_rate": 8.482673128753947e-07 + }, + { + "step": 606, + "epoch": 0.9711538461538461, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12570368, + "loss": 0.3424, + "grad_norm": 2.54246187210083, + "learning_rate": 7.613994192586736e-07 + }, + { + "step": 607, + "epoch": 0.9727564102564102, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125694464, + "loss": 0.4671, + "grad_norm": 2.689720869064331, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 608, + "epoch": 0.9743589743589743, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125686784, + "loss": 0.6331, + "grad_norm": 4.074700355529785, + "learning_rate": 6.017064746021094e-07 + }, + { + "step": 609, + "epoch": 0.9759615384615384, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125709824, + "loss": 0.379, + "grad_norm": 3.2855963706970215, + "learning_rate": 5.288864314965003e-07 + }, + { + "step": 610, + "epoch": 0.9775641025641025, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125699072, + "loss": 0.3059, + "grad_norm": 2.2233407497406006, + "learning_rate": 4.607537683404106e-07 + }, + { + "step": 611, + "epoch": 0.9791666666666666, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125683712, + "loss": 0.3913, + "grad_norm": 2.2186269760131836, + "learning_rate": 3.973106217585842e-07 + }, + { + "step": 612, + "epoch": 0.9807692307692307, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12574976, + "loss": 0.3728, + "grad_norm": 2.39900541305542, + "learning_rate": 3.3855898131356915e-07 + }, + { + "step": 613, + "epoch": 0.9823717948717948, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125692928, + "loss": 0.4227, + "grad_norm": 3.1128368377685547, + "learning_rate": 2.845006894433843e-07 + }, + { + "step": 614, + "epoch": 0.9839743589743589, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125679104, + "loss": 0.3862, + "grad_norm": 3.068734645843506, + "learning_rate": 2.351374414037155e-07 + }, + { + "step": 615, + "epoch": 0.9855769230769231, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125743616, + "loss": 0.468, + "grad_norm": 3.392822027206421, + "learning_rate": 1.9047078521474135e-07 + }, + { + "step": 616, + "epoch": 0.9871794871794872, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12567296, + "loss": 0.47, + "grad_norm": 3.387977361679077, + "learning_rate": 1.505021216125557e-07 + }, + { + "step": 617, + "epoch": 0.9887820512820513, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125700608, + "loss": 0.2626, + "grad_norm": 2.4619293212890625, + "learning_rate": 1.1523270400535245e-07 + }, + { + "step": 618, + "epoch": 0.9903846153846154, + "cpu_mem": 3.43646208, + "gpu_mem": 1.12570368, + "loss": 0.5746, + "grad_norm": 3.0479586124420166, + "learning_rate": 8.466363843397383e-08 + }, + { + "step": 619, + "epoch": 0.9919871794871795, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125679104, + "loss": 0.5753, + "grad_norm": 3.003659248352051, + "learning_rate": 5.8795883537338106e-08 + }, + { + "step": 620, + "epoch": 0.9935897435897436, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125706752, + "loss": 0.4347, + "grad_norm": 3.350276231765747, + "learning_rate": 3.763025052231361e-08 + }, + { + "step": 621, + "epoch": 0.9951923076923077, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125715968, + "loss": 0.3407, + "grad_norm": 2.886871099472046, + "learning_rate": 2.1167403138339088e-08 + }, + { + "step": 622, + "epoch": 0.9967948717948718, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125712896, + "loss": 0.4129, + "grad_norm": 2.5064327716827393, + "learning_rate": 9.407857656540397e-09 + }, + { + "step": 623, + "epoch": 0.9983974358974359, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125685248, + "loss": 0.4907, + "grad_norm": 3.19950270652771, + "learning_rate": 2.3519828535434325e-09 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125431808, + "loss": 0.3893, + "grad_norm": 3.146829605102539, + "learning_rate": 0.0 + }, + { + "step": 624, + "epoch": 1.0, + "cpu_mem": 3.43646208, + "gpu_mem": 1.125431808, + "train_runtime": 8218.3024, + "train_samples_per_second": 4.856, + "train_steps_per_second": 0.076, + "total_flos": 8.474956527272755e+16, + "train_loss": 0.765637666082535 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f43ee5d95e6efa86bc12e96d56fbf5a2c265b7 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..30c60f3dbefe1ce24d0c66e9893c80663cae4fbb --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.30675480272670935 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..138193fa3729114eb0fd12eb738560add3f50c62 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-logiqa-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-logiqa-r2-a2", + "seed": 42, + "timestamp": "2025-08-31T20:01:11.643155" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..c5b1912e460cb294b9afc6a0b4f4f2c6c31b1662 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r2-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 3.260260352, + "gpu_mem": 1.056342528, + "loss": 3.684, + "grad_norm": 15.077292442321777, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068977664, + "loss": 3.9445, + "grad_norm": 16.149879455566406, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069054464, + "loss": 3.7708, + "grad_norm": 15.582755088806152, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068953088, + "loss": 3.7695, + "grad_norm": 16.539838790893555, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068968448, + "loss": 3.7969, + "grad_norm": 16.101781845092773, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068960768, + "loss": 3.693, + "grad_norm": 14.890641212463379, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069031424, + "loss": 3.4744, + "grad_norm": 15.132516860961914, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069016064, + "loss": 3.3924, + "grad_norm": 15.13548469543457, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069014528, + "loss": 3.276, + "grad_norm": 14.78049087524414, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06902528, + "loss": 3.224, + "grad_norm": 13.63449478149414, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068928512, + "loss": 2.9005, + "grad_norm": 13.197659492492676, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 3.260260352, + "gpu_mem": 1.0689792, + "loss": 2.6319, + "grad_norm": 11.722529411315918, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06907136, + "loss": 2.412, + "grad_norm": 10.444768905639648, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068983808, + "loss": 2.2367, + "grad_norm": 8.572355270385742, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069122048, + "loss": 1.9706, + "grad_norm": 7.199042320251465, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068982272, + "loss": 1.8206, + "grad_norm": 5.1154303550720215, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069012992, + "loss": 1.5806, + "grad_norm": 2.8572068214416504, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068976128, + "loss": 1.5623, + "grad_norm": 2.625800132751465, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068883968, + "loss": 1.5041, + "grad_norm": 1.5365272760391235, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068922368, + "loss": 1.4813, + "grad_norm": 1.0131875276565552, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069056, + "loss": 1.5276, + "grad_norm": 2.8515119552612305, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068954624, + "loss": 1.4777, + "grad_norm": 1.9568270444869995, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068982272, + "loss": 1.4313, + "grad_norm": 1.7025142908096313, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068976128, + "loss": 1.4385, + "grad_norm": 1.5104219913482666, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068982272, + "loss": 1.4503, + "grad_norm": 1.7656471729278564, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069034496, + "loss": 1.4049, + "grad_norm": 2.7524797916412354, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068976128, + "loss": 1.5127, + "grad_norm": 3.5006628036499023, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068922368, + "loss": 1.496, + "grad_norm": 3.3367626667022705, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069014528, + "loss": 1.5023, + "grad_norm": 3.312037706375122, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06900992, + "loss": 1.539, + "grad_norm": 4.140618801116943, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068988416, + "loss": 1.4721, + "grad_norm": 3.07094144821167, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068993024, + "loss": 1.4138, + "grad_norm": 1.6824312210083008, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069028352, + "loss": 1.3578, + "grad_norm": 1.784608006477356, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068953088, + "loss": 1.4631, + "grad_norm": 1.4232090711593628, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069000704, + "loss": 1.4543, + "grad_norm": 2.028214931488037, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069016064, + "loss": 1.4813, + "grad_norm": 2.8354604244232178, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069082112, + "loss": 1.3801, + "grad_norm": 2.191455841064453, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068965376, + "loss": 1.3842, + "grad_norm": 0.716363787651062, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069082112, + "loss": 1.429, + "grad_norm": 1.2041651010513306, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069003776, + "loss": 1.3633, + "grad_norm": 0.837681233882904, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 3.260260352, + "gpu_mem": 1.0689024, + "loss": 1.4264, + "grad_norm": 1.1955137252807617, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068974592, + "loss": 1.4301, + "grad_norm": 1.0139071941375732, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068937728, + "loss": 1.4171, + "grad_norm": 1.3519630432128906, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068973056, + "loss": 1.4174, + "grad_norm": 0.9861114621162415, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069026816, + "loss": 1.4091, + "grad_norm": 0.7880063056945801, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069074432, + "loss": 1.3886, + "grad_norm": 1.0357568264007568, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 3.260260352, + "gpu_mem": 1.0689024, + "loss": 1.404, + "grad_norm": 1.359277606010437, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06895616, + "loss": 1.4033, + "grad_norm": 1.114329218864441, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068945408, + "loss": 1.4001, + "grad_norm": 1.027910590171814, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068954624, + "loss": 1.3923, + "grad_norm": 0.8557435274124146, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069049856, + "loss": 1.3617, + "grad_norm": 0.7158666849136353, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068991488, + "loss": 1.4335, + "grad_norm": 1.8222296237945557, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069069824, + "loss": 1.4367, + "grad_norm": 1.6771444082260132, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068969984, + "loss": 1.416, + "grad_norm": 0.8672899603843689, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068966912, + "loss": 1.3828, + "grad_norm": 0.68814617395401, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069011456, + "loss": 1.3886, + "grad_norm": 0.9340230822563171, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069003776, + "loss": 1.3949, + "grad_norm": 0.9134350419044495, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069026816, + "loss": 1.4177, + "grad_norm": 0.9928712248802185, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06899456, + "loss": 1.359, + "grad_norm": 1.1458396911621094, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068985344, + "loss": 1.4236, + "grad_norm": 1.0920778512954712, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069022208, + "loss": 1.3695, + "grad_norm": 0.6085472106933594, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069062144, + "loss": 1.3987, + "grad_norm": 0.8197776675224304, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068991488, + "loss": 1.3757, + "grad_norm": 0.5303282737731934, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068893184, + "loss": 1.3913, + "grad_norm": 1.0873873233795166, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068974592, + "loss": 1.4014, + "grad_norm": 0.7953751087188721, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06907904, + "loss": 1.4037, + "grad_norm": 0.3341888189315796, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06895616, + "loss": 1.4333, + "grad_norm": 0.7198785543441772, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069008384, + "loss": 1.4024, + "grad_norm": 0.4656113088130951, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069000704, + "loss": 1.3862, + "grad_norm": 0.4637596607208252, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068916224, + "loss": 1.3807, + "grad_norm": 0.3060680627822876, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068939264, + "loss": 1.415, + "grad_norm": 1.0426770448684692, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068988416, + "loss": 1.3911, + "grad_norm": 0.45170557498931885, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068942336, + "loss": 1.3938, + "grad_norm": 0.3682629466056824, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068996096, + "loss": 1.4013, + "grad_norm": 0.6410622596740723, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069039104, + "loss": 1.3681, + "grad_norm": 0.728678822517395, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06898688, + "loss": 1.3742, + "grad_norm": 0.4725222587585449, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06892544, + "loss": 1.3903, + "grad_norm": 0.5953335762023926, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069026816, + "loss": 1.3496, + "grad_norm": 0.6144120097160339, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069043712, + "loss": 1.3358, + "grad_norm": 1.584025263786316, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068883968, + "loss": 1.4496, + "grad_norm": 1.7879050970077515, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068989952, + "loss": 1.4495, + "grad_norm": 1.83089280128479, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068962304, + "loss": 1.3983, + "grad_norm": 1.3634779453277588, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068976128, + "loss": 1.4156, + "grad_norm": 1.7495782375335693, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068982272, + "loss": 1.4168, + "grad_norm": 1.1431233882904053, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069011456, + "loss": 1.3921, + "grad_norm": 0.6853522062301636, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068953088, + "loss": 1.3701, + "grad_norm": 0.6163510084152222, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068974592, + "loss": 1.3932, + "grad_norm": 0.3338964581489563, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069000704, + "loss": 1.3685, + "grad_norm": 0.795848548412323, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06894848, + "loss": 1.411, + "grad_norm": 0.7436772584915161, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068980736, + "loss": 1.4426, + "grad_norm": 0.9532755613327026, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068973056, + "loss": 1.3643, + "grad_norm": 0.6766322255134583, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068913152, + "loss": 1.3884, + "grad_norm": 0.503589928150177, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069114368, + "loss": 1.3808, + "grad_norm": 0.4560159742832184, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068928512, + "loss": 1.4551, + "grad_norm": 1.4479016065597534, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069095936, + "loss": 1.4369, + "grad_norm": 1.017583966255188, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068969984, + "loss": 1.374, + "grad_norm": 0.6969099640846252, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068965376, + "loss": 1.3685, + "grad_norm": 0.409608393907547, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069012992, + "loss": 1.3874, + "grad_norm": 0.7747059464454651, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06899456, + "loss": 1.4231, + "grad_norm": 1.5363117456436157, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068959232, + "loss": 1.3809, + "grad_norm": 0.710715651512146, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068942336, + "loss": 1.4041, + "grad_norm": 0.7156769633293152, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068993024, + "loss": 1.4152, + "grad_norm": 1.0680019855499268, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068953088, + "loss": 1.3867, + "grad_norm": 0.6696789860725403, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069042176, + "loss": 1.3912, + "grad_norm": 0.44379329681396484, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068911616, + "loss": 1.3743, + "grad_norm": 0.6154740452766418, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068974592, + "loss": 1.4032, + "grad_norm": 0.7824257016181946, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068969984, + "loss": 1.3689, + "grad_norm": 0.34199875593185425, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068959232, + "loss": 1.3748, + "grad_norm": 0.4927796721458435, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069128192, + "loss": 1.4005, + "grad_norm": 0.5135883688926697, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06892544, + "loss": 1.3374, + "grad_norm": 0.6255322694778442, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068926976, + "loss": 1.3665, + "grad_norm": 0.6189605593681335, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069026816, + "loss": 1.3937, + "grad_norm": 0.7617297172546387, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06912512, + "loss": 1.3949, + "grad_norm": 0.8942378163337708, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068954624, + "loss": 1.3859, + "grad_norm": 0.780949056148529, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06896384, + "loss": 1.4093, + "grad_norm": 0.6678350567817688, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06902528, + "loss": 1.3954, + "grad_norm": 0.8226045966148376, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06891776, + "loss": 1.3589, + "grad_norm": 0.4410369396209717, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069012992, + "loss": 1.3775, + "grad_norm": 0.35748225450515747, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069146624, + "loss": 1.3607, + "grad_norm": 0.6103960275650024, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069049856, + "loss": 1.4534, + "grad_norm": 1.811539649963379, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069095936, + "loss": 1.3862, + "grad_norm": 0.831775963306427, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06904064, + "loss": 1.3938, + "grad_norm": 0.5605126023292542, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06907904, + "loss": 1.3751, + "grad_norm": 0.8226135969161987, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 3.260260352, + "gpu_mem": 1.06900224, + "loss": 1.3867, + "grad_norm": 0.8372721076011658, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 3.260260352, + "gpu_mem": 1.069037568, + "loss": 1.3855, + "grad_norm": 0.5893469452857971, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068943872, + "loss": 1.3535, + "grad_norm": 0.4739958345890045, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 3.260260352, + "gpu_mem": 1.068968448, + "loss": 1.4341, + "grad_norm": 0.967251181602478, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0689408, + "loss": 1.3933, + "grad_norm": 0.6512871384620667, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068946944, + "loss": 1.4052, + "grad_norm": 0.7505720853805542, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068937728, + "loss": 1.4151, + "grad_norm": 0.8292710781097412, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06887168, + "loss": 1.3873, + "grad_norm": 0.5161054134368896, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069034496, + "loss": 1.3973, + "grad_norm": 0.47214168310165405, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068946944, + "loss": 1.422, + "grad_norm": 1.1147072315216064, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068950016, + "loss": 1.3868, + "grad_norm": 0.28295010328292847, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069005312, + "loss": 1.3852, + "grad_norm": 0.6706323027610779, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068908544, + "loss": 1.388, + "grad_norm": 0.4647975564002991, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069075968, + "loss": 1.3918, + "grad_norm": 0.4456006586551666, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068942336, + "loss": 1.3924, + "grad_norm": 0.6863746047019958, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068931584, + "loss": 1.3888, + "grad_norm": 1.0309451818466187, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068983808, + "loss": 1.4057, + "grad_norm": 0.9512726068496704, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068908544, + "loss": 1.37, + "grad_norm": 0.7596926689147949, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068969984, + "loss": 1.4012, + "grad_norm": 0.8726382851600647, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068968448, + "loss": 1.4267, + "grad_norm": 0.9781332612037659, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068913152, + "loss": 1.3648, + "grad_norm": 0.38445258140563965, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06891008, + "loss": 1.3859, + "grad_norm": 0.6344216465950012, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069011456, + "loss": 1.3793, + "grad_norm": 0.8161703944206238, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069016064, + "loss": 1.3891, + "grad_norm": 0.802743136882782, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069114368, + "loss": 1.3688, + "grad_norm": 0.5557290315628052, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068957696, + "loss": 1.3863, + "grad_norm": 0.7681324481964111, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068950016, + "loss": 1.3792, + "grad_norm": 0.5099794268608093, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068893184, + "loss": 1.3657, + "grad_norm": 0.43498918414115906, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06892544, + "loss": 1.3772, + "grad_norm": 0.5509082078933716, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068953088, + "loss": 1.3557, + "grad_norm": 0.47475355863571167, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069003776, + "loss": 1.3814, + "grad_norm": 0.7616018056869507, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068916224, + "loss": 1.3942, + "grad_norm": 0.47040361166000366, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068936192, + "loss": 1.3997, + "grad_norm": 0.9357184767723083, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069012992, + "loss": 1.3881, + "grad_norm": 0.7466822266578674, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068914688, + "loss": 1.4199, + "grad_norm": 0.5928898453712463, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069016064, + "loss": 1.4065, + "grad_norm": 0.7770447134971619, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068973056, + "loss": 1.4217, + "grad_norm": 1.1771316528320312, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06897152, + "loss": 1.3575, + "grad_norm": 0.5705345869064331, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068954624, + "loss": 1.3989, + "grad_norm": 0.41517916321754456, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06907136, + "loss": 1.4163, + "grad_norm": 0.7363594770431519, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068966912, + "loss": 1.3693, + "grad_norm": 0.6944501399993896, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069046784, + "loss": 1.3822, + "grad_norm": 0.667091965675354, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06895616, + "loss": 1.3558, + "grad_norm": 0.6577123403549194, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06891776, + "loss": 1.3939, + "grad_norm": 0.8621401786804199, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068920832, + "loss": 1.389, + "grad_norm": 0.9516242742538452, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068988416, + "loss": 1.3741, + "grad_norm": 0.6649161577224731, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068996096, + "loss": 1.3951, + "grad_norm": 0.43673238158226013, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068969984, + "loss": 1.3853, + "grad_norm": 0.4593353271484375, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068950016, + "loss": 1.3856, + "grad_norm": 0.3480646014213562, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06898688, + "loss": 1.3869, + "grad_norm": 0.4908581078052521, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068930048, + "loss": 1.3963, + "grad_norm": 0.42936941981315613, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069000704, + "loss": 1.366, + "grad_norm": 0.39526286721229553, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068897792, + "loss": 1.3805, + "grad_norm": 0.3896453082561493, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068907008, + "loss": 1.3791, + "grad_norm": 0.6235100030899048, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068983808, + "loss": 1.3671, + "grad_norm": 0.3914746046066284, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068991488, + "loss": 1.3887, + "grad_norm": 0.5433544516563416, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068997632, + "loss": 1.3871, + "grad_norm": 0.35379138588905334, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068916224, + "loss": 1.3946, + "grad_norm": 0.3653639554977417, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069026816, + "loss": 1.3944, + "grad_norm": 0.38475465774536133, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069014528, + "loss": 1.3891, + "grad_norm": 0.4092424511909485, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068997632, + "loss": 1.3792, + "grad_norm": 0.7149854898452759, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068936192, + "loss": 1.3902, + "grad_norm": 0.7813953161239624, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068968448, + "loss": 1.3593, + "grad_norm": 0.3118617832660675, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069065216, + "loss": 1.3639, + "grad_norm": 0.29891160130500793, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069037568, + "loss": 1.3461, + "grad_norm": 0.3045106828212738, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068942336, + "loss": 1.4884, + "grad_norm": 1.1554603576660156, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068974592, + "loss": 1.381, + "grad_norm": 0.7117700576782227, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06896384, + "loss": 1.4252, + "grad_norm": 0.7419150471687317, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068900864, + "loss": 1.4005, + "grad_norm": 0.8789898157119751, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069022208, + "loss": 1.4146, + "grad_norm": 0.518558919429779, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068943872, + "loss": 1.3612, + "grad_norm": 0.36738091707229614, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069056, + "loss": 1.3805, + "grad_norm": 0.38107359409332275, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069051392, + "loss": 1.3863, + "grad_norm": 0.5006668567657471, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 2.0731, + "grad_norm": 0.5376245379447937, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075359232, + "loss": 1.3784, + "grad_norm": 0.3386088013648987, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07535616, + "loss": 1.4061, + "grad_norm": 0.7654465436935425, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07524096, + "loss": 1.3551, + "grad_norm": 0.558188259601593, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075291648, + "loss": 1.386, + "grad_norm": 0.7063589692115784, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075285504, + "loss": 1.3939, + "grad_norm": 0.5623087286949158, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07533312, + "loss": 1.3736, + "grad_norm": 0.4176013469696045, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075265536, + "loss": 1.3554, + "grad_norm": 0.45864710211753845, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075311616, + "loss": 1.3702, + "grad_norm": 0.33999794721603394, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07532544, + "loss": 1.3482, + "grad_norm": 0.49820104241371155, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075280896, + "loss": 1.3841, + "grad_norm": 0.7198119759559631, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075273216, + "loss": 1.3731, + "grad_norm": 0.5634872317314148, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07539456, + "loss": 1.3822, + "grad_norm": 0.7005066871643066, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075291648, + "loss": 1.3648, + "grad_norm": 0.5920222401618958, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 1.3677, + "grad_norm": 0.6127324104309082, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075296256, + "loss": 1.3885, + "grad_norm": 0.4673655927181244, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075222528, + "loss": 1.3804, + "grad_norm": 0.5791721940040588, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07521792, + "loss": 1.363, + "grad_norm": 0.7734249234199524, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075268608, + "loss": 1.3756, + "grad_norm": 1.0807422399520874, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075397632, + "loss": 1.3862, + "grad_norm": 0.5920828580856323, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075259392, + "loss": 1.4308, + "grad_norm": 0.8686667084693909, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075285504, + "loss": 1.4061, + "grad_norm": 0.7136402726173401, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075297792, + "loss": 1.4034, + "grad_norm": 0.6014848947525024, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075270144, + "loss": 1.4497, + "grad_norm": 1.342832088470459, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075244032, + "loss": 1.4353, + "grad_norm": 0.6643794178962708, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07525632, + "loss": 1.3841, + "grad_norm": 0.4805084466934204, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075283968, + "loss": 1.3699, + "grad_norm": 0.561011791229248, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0753024, + "loss": 1.3799, + "grad_norm": 0.32088664174079895, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075353088, + "loss": 1.3495, + "grad_norm": 0.3048458397388458, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075303936, + "loss": 1.4013, + "grad_norm": 0.36258700489997864, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075254784, + "loss": 1.3873, + "grad_norm": 0.4210372269153595, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075299328, + "loss": 1.427, + "grad_norm": 0.5888636708259583, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075197952, + "loss": 1.3562, + "grad_norm": 0.25928106904029846, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075265536, + "loss": 1.3999, + "grad_norm": 0.38256192207336426, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07525632, + "loss": 1.4098, + "grad_norm": 0.6021372079849243, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075260928, + "loss": 1.3364, + "grad_norm": 0.453902006149292, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075376128, + "loss": 1.3809, + "grad_norm": 0.4964177906513214, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075336192, + "loss": 1.4205, + "grad_norm": 0.6081448197364807, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07529472, + "loss": 1.3919, + "grad_norm": 0.7618113160133362, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 1.3826, + "grad_norm": 0.6178544759750366, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07531776, + "loss": 1.3803, + "grad_norm": 0.4809180796146393, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075337728, + "loss": 1.3788, + "grad_norm": 0.4649932384490967, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075254784, + "loss": 1.3753, + "grad_norm": 0.662265956401825, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075389952, + "loss": 1.3771, + "grad_norm": 0.5203429460525513, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075343872, + "loss": 1.3688, + "grad_norm": 0.38688206672668457, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075365376, + "loss": 1.3751, + "grad_norm": 0.4566575288772583, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075251712, + "loss": 1.3749, + "grad_norm": 1.1173632144927979, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075239424, + "loss": 1.3979, + "grad_norm": 0.848514974117279, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075244032, + "loss": 1.3721, + "grad_norm": 0.6906311511993408, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075280896, + "loss": 1.3689, + "grad_norm": 0.56070476770401, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075242496, + "loss": 1.3559, + "grad_norm": 0.7285918593406677, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075288576, + "loss": 1.3972, + "grad_norm": 0.5619736909866333, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075297792, + "loss": 1.3903, + "grad_norm": 0.41462892293930054, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075336192, + "loss": 1.3766, + "grad_norm": 0.654066264629364, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075254784, + "loss": 1.3808, + "grad_norm": 0.8032487034797668, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075264, + "loss": 1.3525, + "grad_norm": 0.8147600889205933, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075330048, + "loss": 1.3488, + "grad_norm": 0.5062264204025269, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07525632, + "loss": 1.3309, + "grad_norm": 0.5720639228820801, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075250176, + "loss": 1.3894, + "grad_norm": 0.6963638067245483, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075316224, + "loss": 1.3892, + "grad_norm": 0.5950867533683777, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075274752, + "loss": 1.3434, + "grad_norm": 0.45890557765960693, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075388416, + "loss": 1.3589, + "grad_norm": 0.6483781337738037, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075265536, + "loss": 1.3692, + "grad_norm": 0.6877549886703491, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075291648, + "loss": 1.3754, + "grad_norm": 0.5912956595420837, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075297792, + "loss": 1.3666, + "grad_norm": 0.8282333016395569, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075365376, + "loss": 1.3676, + "grad_norm": 0.5816847681999207, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075220992, + "loss": 1.3935, + "grad_norm": 0.899387776851654, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075354624, + "loss": 1.3765, + "grad_norm": 0.4678283929824829, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075383808, + "loss": 1.3534, + "grad_norm": 0.8070634007453918, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075253248, + "loss": 1.3714, + "grad_norm": 0.8699640035629272, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075254784, + "loss": 1.4172, + "grad_norm": 0.8502652049064636, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07525632, + "loss": 1.3544, + "grad_norm": 0.5194823741912842, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075330048, + "loss": 1.41, + "grad_norm": 1.2219352722167969, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075311616, + "loss": 1.3519, + "grad_norm": 0.513081431388855, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075254784, + "loss": 1.4196, + "grad_norm": 1.0225721597671509, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 1.3587, + "grad_norm": 0.6886439323425293, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075360768, + "loss": 1.3564, + "grad_norm": 0.637144923210144, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075313152, + "loss": 1.4181, + "grad_norm": 0.8329043984413147, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07544064, + "loss": 1.3822, + "grad_norm": 0.6254027485847473, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07532544, + "loss": 1.3705, + "grad_norm": 0.4880826771259308, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075274752, + "loss": 1.3259, + "grad_norm": 0.5492273569107056, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075262464, + "loss": 1.3485, + "grad_norm": 0.7227955460548401, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075254784, + "loss": 1.3788, + "grad_norm": 0.5486322641372681, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075377664, + "loss": 1.3875, + "grad_norm": 0.8094238042831421, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07531776, + "loss": 1.4243, + "grad_norm": 1.0958892107009888, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075264, + "loss": 1.4281, + "grad_norm": 0.9480810165405273, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075280896, + "loss": 1.4092, + "grad_norm": 0.8580055236816406, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075420672, + "loss": 1.3611, + "grad_norm": 0.4051409065723419, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075291648, + "loss": 1.379, + "grad_norm": 0.39697131514549255, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07528704, + "loss": 1.4039, + "grad_norm": 0.6111369729042053, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075190272, + "loss": 1.3715, + "grad_norm": 0.4666951298713684, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075265536, + "loss": 1.3681, + "grad_norm": 0.4495439827442169, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075285504, + "loss": 1.3923, + "grad_norm": 0.3940727710723877, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075251712, + "loss": 1.3541, + "grad_norm": 0.3307391405105591, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075213312, + "loss": 1.3697, + "grad_norm": 0.33822059631347656, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075339264, + "loss": 1.3721, + "grad_norm": 0.31101471185684204, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075216384, + "loss": 1.402, + "grad_norm": 0.44738689064979553, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075259392, + "loss": 1.3962, + "grad_norm": 0.4075678884983063, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075242496, + "loss": 1.343, + "grad_norm": 0.5061039328575134, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075283968, + "loss": 1.3806, + "grad_norm": 0.48590484261512756, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075224064, + "loss": 1.3643, + "grad_norm": 0.39738908410072327, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075247104, + "loss": 1.357, + "grad_norm": 0.5950372219085693, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075236352, + "loss": 1.3594, + "grad_norm": 0.39589792490005493, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07524864, + "loss": 1.3793, + "grad_norm": 0.3703109920024872, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075251712, + "loss": 1.3371, + "grad_norm": 0.7604830265045166, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07523328, + "loss": 1.3755, + "grad_norm": 0.39655131101608276, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075239424, + "loss": 1.3705, + "grad_norm": 0.5908136367797852, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0751872, + "loss": 1.4197, + "grad_norm": 1.2583070993423462, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075259392, + "loss": 1.3615, + "grad_norm": 0.4946900010108948, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075300864, + "loss": 1.3463, + "grad_norm": 0.5299191474914551, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07524864, + "loss": 1.3493, + "grad_norm": 0.4106220304965973, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075259392, + "loss": 1.4038, + "grad_norm": 0.5659478306770325, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075242496, + "loss": 1.3981, + "grad_norm": 0.7547957301139832, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075297792, + "loss": 1.373, + "grad_norm": 0.4123050272464752, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075264, + "loss": 1.3557, + "grad_norm": 0.401769757270813, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075273216, + "loss": 1.3634, + "grad_norm": 0.5397482514381409, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075274752, + "loss": 1.3633, + "grad_norm": 0.8580787777900696, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07550208, + "loss": 1.368, + "grad_norm": 0.5129746794700623, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07529472, + "loss": 1.3493, + "grad_norm": 0.5968105792999268, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075267072, + "loss": 1.3812, + "grad_norm": 0.6740577816963196, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075231744, + "loss": 1.3783, + "grad_norm": 0.9293946623802185, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075322368, + "loss": 1.3356, + "grad_norm": 0.8153334856033325, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075247104, + "loss": 1.3336, + "grad_norm": 0.7719043493270874, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07529472, + "loss": 1.3559, + "grad_norm": 0.6478682160377502, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075236352, + "loss": 1.3476, + "grad_norm": 0.742011308670044, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075457536, + "loss": 1.3931, + "grad_norm": 1.034771203994751, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075339264, + "loss": 1.3413, + "grad_norm": 0.6907620429992676, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075231744, + "loss": 1.3701, + "grad_norm": 1.2793704271316528, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07527168, + "loss": 1.4064, + "grad_norm": 0.9554460048675537, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075319296, + "loss": 1.3204, + "grad_norm": 0.7702943682670593, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075285504, + "loss": 1.3708, + "grad_norm": 0.7505571246147156, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0753408, + "loss": 1.376, + "grad_norm": 0.7742499709129333, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07523328, + "loss": 1.3774, + "grad_norm": 0.7097276449203491, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07531776, + "loss": 1.3658, + "grad_norm": 0.9413597583770752, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075307008, + "loss": 1.3098, + "grad_norm": 0.7703001499176025, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075262464, + "loss": 1.3946, + "grad_norm": 0.726716160774231, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075351552, + "loss": 1.401, + "grad_norm": 0.7063893675804138, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07527936, + "loss": 1.3587, + "grad_norm": 0.5727053284645081, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075288576, + "loss": 1.3968, + "grad_norm": 0.6635168194770813, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0753408, + "loss": 1.3557, + "grad_norm": 0.6127541661262512, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0753408, + "loss": 1.3557, + "grad_norm": 1.2226934432983398, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075377664, + "loss": 1.4045, + "grad_norm": 0.8809286952018738, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07529472, + "loss": 1.4094, + "grad_norm": 0.746256411075592, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07536384, + "loss": 1.3737, + "grad_norm": 0.5475013852119446, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075305472, + "loss": 1.3587, + "grad_norm": 0.5856518149375916, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075311616, + "loss": 1.3244, + "grad_norm": 0.6645670533180237, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075222528, + "loss": 1.4065, + "grad_norm": 0.8894450068473816, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0753408, + "loss": 1.3957, + "grad_norm": 0.6157739162445068, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 1.3639, + "grad_norm": 0.5204671025276184, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075239424, + "loss": 1.3717, + "grad_norm": 0.7175347208976746, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075288576, + "loss": 1.3523, + "grad_norm": 0.6337887048721313, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075414528, + "loss": 1.3818, + "grad_norm": 0.5410606861114502, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075213312, + "loss": 1.3388, + "grad_norm": 0.44364240765571594, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075277824, + "loss": 1.3667, + "grad_norm": 0.43266379833221436, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 1.3431, + "grad_norm": 0.4497270882129669, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075283968, + "loss": 1.3546, + "grad_norm": 0.3884933292865753, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075280896, + "loss": 1.3494, + "grad_norm": 0.8717014789581299, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07525632, + "loss": 1.3941, + "grad_norm": 0.57502681016922, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075239424, + "loss": 1.3608, + "grad_norm": 0.3815855383872986, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07524096, + "loss": 1.3591, + "grad_norm": 0.36185210943222046, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 1.3396, + "grad_norm": 0.600318193435669, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075359232, + "loss": 1.4034, + "grad_norm": 1.1140371561050415, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075313152, + "loss": 1.3603, + "grad_norm": 0.47645485401153564, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07536384, + "loss": 1.3748, + "grad_norm": 0.4927104711532593, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075334656, + "loss": 1.4044, + "grad_norm": 0.8027552366256714, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075268608, + "loss": 1.342, + "grad_norm": 0.505375325679779, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0752256, + "loss": 1.3192, + "grad_norm": 0.5314315557479858, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075253248, + "loss": 1.3177, + "grad_norm": 0.8384084701538086, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075236352, + "loss": 1.38, + "grad_norm": 0.607130229473114, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07527936, + "loss": 1.3475, + "grad_norm": 0.7482250928878784, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07533312, + "loss": 1.3615, + "grad_norm": 0.7324480414390564, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075230208, + "loss": 1.367, + "grad_norm": 0.6653967499732971, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075290112, + "loss": 1.3559, + "grad_norm": 0.9320836067199707, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075213312, + "loss": 1.3743, + "grad_norm": 0.995067834854126, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075373056, + "loss": 1.339, + "grad_norm": 0.7353968024253845, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075227136, + "loss": 1.3532, + "grad_norm": 0.5923476219177246, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075250176, + "loss": 1.3468, + "grad_norm": 0.6028521656990051, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075273216, + "loss": 1.4484, + "grad_norm": 1.5159155130386353, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075497472, + "loss": 1.4479, + "grad_norm": 1.1313750743865967, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075254784, + "loss": 1.3795, + "grad_norm": 0.6488771438598633, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075377664, + "loss": 1.3731, + "grad_norm": 0.7174305319786072, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075293184, + "loss": 1.3639, + "grad_norm": 0.6471494436264038, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075382272, + "loss": 1.3674, + "grad_norm": 1.0023196935653687, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075280896, + "loss": 1.3585, + "grad_norm": 0.7219704389572144, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075288576, + "loss": 1.3723, + "grad_norm": 0.6385353207588196, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075211776, + "loss": 1.3404, + "grad_norm": 0.7867012619972229, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075291648, + "loss": 1.4095, + "grad_norm": 0.9182313680648804, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075262464, + "loss": 1.3811, + "grad_norm": 0.7023774981498718, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075250176, + "loss": 1.3302, + "grad_norm": 0.5977196097373962, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075393024, + "loss": 1.3045, + "grad_norm": 0.7652204036712646, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075276288, + "loss": 1.3514, + "grad_norm": 0.6074302196502686, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07528704, + "loss": 1.3594, + "grad_norm": 0.6466613411903381, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075442176, + "loss": 1.4008, + "grad_norm": 0.8243738412857056, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075251712, + "loss": 1.3751, + "grad_norm": 0.7568191885948181, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 3.258884096, + "gpu_mem": 1.075273216, + "loss": 1.375, + "grad_norm": 0.7072599530220032, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 3.258884096, + "gpu_mem": 1.07531008, + "loss": 1.3674, + "grad_norm": 0.6050167679786682, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 3.258884096, + "gpu_mem": 1.074896896, + "loss": 2.0718, + "grad_norm": 1.0951662063598633, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068923904, + "loss": 1.3513, + "grad_norm": 0.8512520790100098, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0689024, + "loss": 1.3551, + "grad_norm": 0.5840802192687988, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06891776, + "loss": 1.3521, + "grad_norm": 0.9248055815696716, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069065216, + "loss": 1.3316, + "grad_norm": 0.5586864948272705, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069011456, + "loss": 1.365, + "grad_norm": 0.6278505325317383, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069028352, + "loss": 1.3721, + "grad_norm": 0.6287627220153809, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068980736, + "loss": 1.3652, + "grad_norm": 0.5956084132194519, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069005312, + "loss": 1.363, + "grad_norm": 0.5816306471824646, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069052928, + "loss": 1.3374, + "grad_norm": 0.670527458190918, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068962304, + "loss": 1.3431, + "grad_norm": 0.8692474961280823, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068934656, + "loss": 1.3305, + "grad_norm": 0.8087844252586365, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069023744, + "loss": 1.3242, + "grad_norm": 0.5420864820480347, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068914688, + "loss": 1.332, + "grad_norm": 0.7683371305465698, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068966912, + "loss": 1.3028, + "grad_norm": 0.6637480854988098, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068920832, + "loss": 1.3489, + "grad_norm": 0.7041542530059814, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069123584, + "loss": 1.3467, + "grad_norm": 0.692335844039917, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069122048, + "loss": 1.3632, + "grad_norm": 1.0641117095947266, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069045248, + "loss": 1.3242, + "grad_norm": 0.6776431202888489, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06900224, + "loss": 1.3035, + "grad_norm": 0.7674925327301025, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0689792, + "loss": 1.2996, + "grad_norm": 0.8167929649353027, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06893312, + "loss": 1.3091, + "grad_norm": 0.8004099130630493, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069049856, + "loss": 1.3459, + "grad_norm": 1.3796700239181519, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068937728, + "loss": 1.381, + "grad_norm": 0.847804069519043, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068926976, + "loss": 1.3297, + "grad_norm": 0.951156497001648, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068931584, + "loss": 1.2919, + "grad_norm": 0.818080723285675, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069046784, + "loss": 1.4146, + "grad_norm": 1.181946039199829, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069020672, + "loss": 1.3439, + "grad_norm": 1.088658094406128, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068922368, + "loss": 1.3122, + "grad_norm": 1.7012630701065063, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068957696, + "loss": 1.3135, + "grad_norm": 1.1662405729293823, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068923904, + "loss": 1.3104, + "grad_norm": 0.8840146064758301, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069057536, + "loss": 1.2838, + "grad_norm": 0.8818463683128357, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069031424, + "loss": 1.3255, + "grad_norm": 1.0450927019119263, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069075968, + "loss": 1.2787, + "grad_norm": 1.1097426414489746, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068966912, + "loss": 1.3015, + "grad_norm": 1.156864881515503, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068983808, + "loss": 1.2897, + "grad_norm": 1.5556011199951172, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069059072, + "loss": 1.3967, + "grad_norm": 1.4059242010116577, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068907008, + "loss": 1.3736, + "grad_norm": 1.5524768829345703, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06902528, + "loss": 1.3662, + "grad_norm": 2.589759349822998, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069014528, + "loss": 1.3811, + "grad_norm": 2.2743003368377686, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068980736, + "loss": 1.3289, + "grad_norm": 1.4832653999328613, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068959232, + "loss": 1.3374, + "grad_norm": 1.165162444114685, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068954624, + "loss": 1.3519, + "grad_norm": 2.0169007778167725, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068965376, + "loss": 1.3333, + "grad_norm": 1.1919994354248047, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069019136, + "loss": 1.3763, + "grad_norm": 1.4489014148712158, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068936192, + "loss": 1.2996, + "grad_norm": 1.0963399410247803, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06896384, + "loss": 1.3703, + "grad_norm": 1.2970160245895386, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068962304, + "loss": 1.3416, + "grad_norm": 1.1772054433822632, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069005312, + "loss": 1.3231, + "grad_norm": 1.0439027547836304, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069003776, + "loss": 1.3439, + "grad_norm": 1.3096983432769775, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069054464, + "loss": 1.3229, + "grad_norm": 1.606872797012329, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069060608, + "loss": 1.3456, + "grad_norm": 1.2877519130706787, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068999168, + "loss": 1.3641, + "grad_norm": 1.1559590101242065, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068988416, + "loss": 1.3796, + "grad_norm": 1.6624194383621216, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069012992, + "loss": 1.3762, + "grad_norm": 1.3024390935897827, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069029888, + "loss": 1.3074, + "grad_norm": 1.003501296043396, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068900864, + "loss": 1.3553, + "grad_norm": 1.0401990413665771, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069075968, + "loss": 1.3552, + "grad_norm": 1.0564497709274292, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06903296, + "loss": 1.2943, + "grad_norm": 0.9173198938369751, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068942336, + "loss": 1.3202, + "grad_norm": 0.9992145895957947, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068890112, + "loss": 1.3062, + "grad_norm": 0.8433325290679932, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068905472, + "loss": 1.396, + "grad_norm": 1.028709053993225, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069006848, + "loss": 1.2988, + "grad_norm": 0.9761835932731628, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069016064, + "loss": 1.3459, + "grad_norm": 1.358328104019165, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069045248, + "loss": 1.295, + "grad_norm": 1.051002860069275, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069106688, + "loss": 1.318, + "grad_norm": 0.9111893177032471, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068960768, + "loss": 1.3312, + "grad_norm": 1.0719459056854248, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06904832, + "loss": 1.3068, + "grad_norm": 1.3499486446380615, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.3179, + "grad_norm": 0.9045688509941101, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069022208, + "loss": 1.3317, + "grad_norm": 0.9219078421592712, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069057536, + "loss": 1.3516, + "grad_norm": 1.628555417060852, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068996096, + "loss": 1.2659, + "grad_norm": 1.0624788999557495, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068953088, + "loss": 1.314, + "grad_norm": 0.9965498447418213, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068954624, + "loss": 1.3335, + "grad_norm": 1.0532972812652588, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068991488, + "loss": 1.2729, + "grad_norm": 0.9515097141265869, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06896384, + "loss": 1.2625, + "grad_norm": 1.0845255851745605, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069020672, + "loss": 1.3273, + "grad_norm": 1.2697478532791138, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069049856, + "loss": 1.3009, + "grad_norm": 1.7207767963409424, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068950016, + "loss": 1.3321, + "grad_norm": 1.5821716785430908, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068973056, + "loss": 1.2772, + "grad_norm": 1.5854336023330688, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06894848, + "loss": 1.2759, + "grad_norm": 1.2934439182281494, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06904064, + "loss": 1.3274, + "grad_norm": 1.0782780647277832, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06891008, + "loss": 1.3351, + "grad_norm": 1.411712884902954, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069046784, + "loss": 1.3742, + "grad_norm": 1.37353515625, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068903936, + "loss": 1.3051, + "grad_norm": 1.6327844858169556, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068926976, + "loss": 1.2167, + "grad_norm": 1.1278135776519775, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068974592, + "loss": 1.3357, + "grad_norm": 1.1887420415878296, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0690176, + "loss": 1.3526, + "grad_norm": 1.0900686979293823, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068945408, + "loss": 1.3121, + "grad_norm": 1.7821060419082642, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069043712, + "loss": 1.3451, + "grad_norm": 1.7332649230957031, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068939264, + "loss": 1.2735, + "grad_norm": 1.214321255683899, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0690944, + "loss": 1.2607, + "grad_norm": 1.2425888776779175, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068926976, + "loss": 1.3121, + "grad_norm": 1.093100666999817, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068969984, + "loss": 1.3278, + "grad_norm": 1.3861163854599, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068991488, + "loss": 1.3017, + "grad_norm": 1.28607177734375, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068914688, + "loss": 1.3385, + "grad_norm": 1.438772201538086, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069051392, + "loss": 1.3016, + "grad_norm": 1.5404229164123535, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068939264, + "loss": 1.2672, + "grad_norm": 1.2474939823150635, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068936192, + "loss": 1.3675, + "grad_norm": 1.3135405778884888, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06891008, + "loss": 1.2685, + "grad_norm": 1.1005253791809082, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068945408, + "loss": 1.2098, + "grad_norm": 1.4141777753829956, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068960768, + "loss": 1.2992, + "grad_norm": 1.5527915954589844, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068954624, + "loss": 1.3253, + "grad_norm": 1.295177936553955, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068966912, + "loss": 1.3629, + "grad_norm": 1.6039657592773438, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068999168, + "loss": 1.2431, + "grad_norm": 1.5583829879760742, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069000704, + "loss": 1.3025, + "grad_norm": 1.1148289442062378, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068973056, + "loss": 1.3293, + "grad_norm": 1.5956114530563354, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069022208, + "loss": 1.29, + "grad_norm": 1.4830167293548584, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.3736, + "grad_norm": 1.7280524969100952, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06900992, + "loss": 1.2371, + "grad_norm": 1.61713445186615, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069000704, + "loss": 1.1958, + "grad_norm": 1.3413949012756348, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068934656, + "loss": 1.3083, + "grad_norm": 1.6994402408599854, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06895616, + "loss": 1.3239, + "grad_norm": 1.4061304330825806, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069011456, + "loss": 1.3488, + "grad_norm": 1.9850738048553467, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068928512, + "loss": 1.2937, + "grad_norm": 1.2352666854858398, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068993024, + "loss": 1.3575, + "grad_norm": 1.5702208280563354, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.3213, + "grad_norm": 1.582923412322998, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068983808, + "loss": 1.3891, + "grad_norm": 1.6793078184127808, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068957696, + "loss": 1.286, + "grad_norm": 1.5727986097335815, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.3136, + "grad_norm": 1.2482463121414185, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068962304, + "loss": 1.3657, + "grad_norm": 1.820465326309204, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068867072, + "loss": 1.3706, + "grad_norm": 1.4620566368103027, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068936192, + "loss": 1.3026, + "grad_norm": 1.609329104423523, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068983808, + "loss": 1.3224, + "grad_norm": 1.8678323030471802, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068928512, + "loss": 1.3468, + "grad_norm": 1.8419811725616455, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06898688, + "loss": 1.3334, + "grad_norm": 1.3719607591629028, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068993024, + "loss": 1.2125, + "grad_norm": 1.5117361545562744, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069129728, + "loss": 1.2084, + "grad_norm": 1.256232738494873, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068939264, + "loss": 1.2513, + "grad_norm": 1.3836029767990112, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069003776, + "loss": 1.3079, + "grad_norm": 1.4266234636306763, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068919296, + "loss": 1.3224, + "grad_norm": 2.0504775047302246, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068936192, + "loss": 1.2125, + "grad_norm": 1.3279485702514648, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068936192, + "loss": 1.2785, + "grad_norm": 1.3704410791397095, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069062144, + "loss": 1.3489, + "grad_norm": 1.4265310764312744, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068942336, + "loss": 1.2525, + "grad_norm": 1.5959515571594238, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069100544, + "loss": 1.3114, + "grad_norm": 1.3484317064285278, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06892544, + "loss": 1.2755, + "grad_norm": 1.2415663003921509, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068943872, + "loss": 1.3464, + "grad_norm": 1.4447025060653687, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069120512, + "loss": 1.2276, + "grad_norm": 1.5341014862060547, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06899456, + "loss": 1.2327, + "grad_norm": 1.3470693826675415, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068991488, + "loss": 1.3192, + "grad_norm": 1.622653841972351, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069000704, + "loss": 1.3232, + "grad_norm": 1.3931396007537842, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068876288, + "loss": 1.3056, + "grad_norm": 1.4669103622436523, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069158912, + "loss": 1.3359, + "grad_norm": 1.6905311346054077, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068922368, + "loss": 1.2577, + "grad_norm": 1.2862242460250854, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0690176, + "loss": 1.3263, + "grad_norm": 1.2286640405654907, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068983808, + "loss": 1.3339, + "grad_norm": 2.0090959072113037, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.382, + "grad_norm": 1.824827790260315, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068969984, + "loss": 1.4175, + "grad_norm": 1.9275176525115967, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069020672, + "loss": 1.405, + "grad_norm": 1.604884147644043, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068916224, + "loss": 1.3425, + "grad_norm": 1.415191411972046, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068945408, + "loss": 1.2993, + "grad_norm": 1.341451644897461, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068888576, + "loss": 1.3168, + "grad_norm": 1.3440707921981812, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06896384, + "loss": 1.2754, + "grad_norm": 1.4124419689178467, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06895616, + "loss": 1.3096, + "grad_norm": 1.3136605024337769, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06898688, + "loss": 1.297, + "grad_norm": 1.2315781116485596, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068916224, + "loss": 1.3808, + "grad_norm": 1.5503193140029907, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068942336, + "loss": 1.3242, + "grad_norm": 1.432255506515503, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069037568, + "loss": 1.3328, + "grad_norm": 1.5887073278427124, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06893312, + "loss": 1.3076, + "grad_norm": 1.3883767127990723, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068942336, + "loss": 1.3412, + "grad_norm": 1.3405543565750122, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068969984, + "loss": 1.3542, + "grad_norm": 1.5150272846221924, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068983808, + "loss": 1.37, + "grad_norm": 1.6602613925933838, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068977664, + "loss": 1.2987, + "grad_norm": 1.331343412399292, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0689408, + "loss": 1.2584, + "grad_norm": 1.3772581815719604, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068999168, + "loss": 1.3522, + "grad_norm": 2.153369426727295, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068908544, + "loss": 1.3424, + "grad_norm": 1.640823483467102, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068993024, + "loss": 1.3025, + "grad_norm": 1.816098690032959, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069031424, + "loss": 1.237, + "grad_norm": 1.8894262313842773, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069075968, + "loss": 1.386, + "grad_norm": 1.381552815437317, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069037568, + "loss": 1.2783, + "grad_norm": 1.3762825727462769, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068974592, + "loss": 1.3121, + "grad_norm": 1.4234538078308105, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069028352, + "loss": 1.3635, + "grad_norm": 1.7200251817703247, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068966912, + "loss": 1.2921, + "grad_norm": 1.926821231842041, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068953088, + "loss": 1.2667, + "grad_norm": 1.3544071912765503, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069165056, + "loss": 1.2776, + "grad_norm": 1.385190725326538, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069029888, + "loss": 1.2624, + "grad_norm": 2.2606301307678223, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069003776, + "loss": 1.3396, + "grad_norm": 1.5963176488876343, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06900992, + "loss": 1.3182, + "grad_norm": 1.4174522161483765, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069043712, + "loss": 1.359, + "grad_norm": 1.9370280504226685, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.3933, + "grad_norm": 1.394512414932251, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069036032, + "loss": 1.3361, + "grad_norm": 1.6055657863616943, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068923904, + "loss": 1.2649, + "grad_norm": 1.1883058547973633, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06898688, + "loss": 1.356, + "grad_norm": 1.4632490873336792, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.2683, + "grad_norm": 1.457822561264038, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068951552, + "loss": 1.3298, + "grad_norm": 1.334066390991211, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069036032, + "loss": 1.325, + "grad_norm": 1.5908781290054321, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068982272, + "loss": 1.3649, + "grad_norm": 1.5900657176971436, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06900224, + "loss": 1.3524, + "grad_norm": 1.8024336099624634, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068930048, + "loss": 1.2943, + "grad_norm": 2.2248759269714355, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 3.258884096, + "gpu_mem": 1.0690176, + "loss": 1.3627, + "grad_norm": 1.5396898984909058, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068985344, + "loss": 1.2602, + "grad_norm": 1.468740701675415, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 3.258884096, + "gpu_mem": 1.06896384, + "loss": 1.2115, + "grad_norm": 1.2929686307907104, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068907008, + "loss": 1.3241, + "grad_norm": 2.2964682579040527, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 3.258884096, + "gpu_mem": 1.068973056, + "loss": 1.4365, + "grad_norm": 1.7823565006256104, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069014528, + "loss": 1.3074, + "grad_norm": 1.331485629081726, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.258884096, + "gpu_mem": 1.069014528, + "train_runtime": 8524.203, + "train_samples_per_second": 4.423, + "train_steps_per_second": 0.069, + "total_flos": 8.835531003429274e+16, + "train_loss": 1.414679901940482 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..97cff55d3f03a364161498b7b6299c246238daf5 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c30b7e2099d6f72ffe8948b3808354a7789c7d74 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.43317496385044413 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f45feb1f73f7ca2f76baa47d07aa215b9d187a7f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-logiqa-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-logiqa-r32-a2", + "seed": 42, + "timestamp": "2025-09-01T10:24:55.471951" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..fcae46141b7a7a6bc3e323e3b4390e8c1bfc0209 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r32-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 3.324944384, + "gpu_mem": 1.150960128, + "loss": 3.684, + "grad_norm": 61.11761474609375, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352830464, + "loss": 3.9445, + "grad_norm": 65.53050231933594, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352907264, + "loss": 3.4818, + "grad_norm": 58.49680709838867, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352805888, + "loss": 2.9369, + "grad_norm": 49.73374557495117, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352821248, + "loss": 2.4288, + "grad_norm": 33.68429946899414, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352813568, + "loss": 2.0183, + "grad_norm": 17.087142944335938, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352884224, + "loss": 1.6354, + "grad_norm": 8.141523361206055, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352868864, + "loss": 1.5257, + "grad_norm": 3.9435482025146484, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352867328, + "loss": 1.4931, + "grad_norm": 3.77956223487854, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35287808, + "loss": 1.4417, + "grad_norm": 3.205181360244751, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352781312, + "loss": 1.3891, + "grad_norm": 2.279606342315674, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352832, + "loss": 1.4336, + "grad_norm": 3.4928464889526367, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35292416, + "loss": 1.3698, + "grad_norm": 1.62000572681427, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 1.5075, + "grad_norm": 4.0626959800720215, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352974848, + "loss": 1.4119, + "grad_norm": 1.9517098665237427, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352835072, + "loss": 1.4721, + "grad_norm": 3.5457191467285156, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352865792, + "loss": 1.3786, + "grad_norm": 1.9533891677856445, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352828928, + "loss": 1.399, + "grad_norm": 2.709408760070801, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352736768, + "loss": 1.3865, + "grad_norm": 1.1461929082870483, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352775168, + "loss": 1.3787, + "grad_norm": 1.168555736541748, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3529088, + "loss": 1.4746, + "grad_norm": 3.6385059356689453, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352807424, + "loss": 1.4817, + "grad_norm": 3.252824306488037, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352835072, + "loss": 1.3814, + "grad_norm": 1.692505121231079, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352828928, + "loss": 1.3578, + "grad_norm": 0.7379035949707031, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352835072, + "loss": 1.4541, + "grad_norm": 2.695420503616333, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352887296, + "loss": 1.4142, + "grad_norm": 2.6992812156677246, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352828928, + "loss": 1.4424, + "grad_norm": 2.4135286808013916, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352775168, + "loss": 1.4235, + "grad_norm": 1.5799323320388794, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352867328, + "loss": 1.3781, + "grad_norm": 0.9887019395828247, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35286272, + "loss": 1.3306, + "grad_norm": 1.4277961254119873, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352841216, + "loss": 1.3593, + "grad_norm": 1.4336220026016235, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352845824, + "loss": 1.6067, + "grad_norm": 4.08809757232666, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352881152, + "loss": 1.3643, + "grad_norm": 1.8081251382827759, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352805888, + "loss": 1.4729, + "grad_norm": 2.1214096546173096, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352853504, + "loss": 1.3603, + "grad_norm": 0.9614776372909546, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352868864, + "loss": 1.5751, + "grad_norm": 4.161680698394775, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352934912, + "loss": 1.7686, + "grad_norm": 5.528164863586426, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352818176, + "loss": 1.432, + "grad_norm": 1.822394609451294, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352934912, + "loss": 1.459, + "grad_norm": 1.3471583127975464, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352856576, + "loss": 1.3579, + "grad_norm": 0.8547364473342896, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3527552, + "loss": 1.4303, + "grad_norm": 1.524259328842163, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352827392, + "loss": 1.5336, + "grad_norm": 2.494325876235962, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352790528, + "loss": 1.4561, + "grad_norm": 1.9766045808792114, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352825856, + "loss": 1.405, + "grad_norm": 1.078290343284607, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352879616, + "loss": 1.3917, + "grad_norm": 0.6879132986068726, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352927232, + "loss": 1.3741, + "grad_norm": 0.8673797845840454, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3527552, + "loss": 1.3925, + "grad_norm": 1.2323013544082642, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35280896, + "loss": 1.4133, + "grad_norm": 1.565614104270935, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352798208, + "loss": 1.4318, + "grad_norm": 1.7466094493865967, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352807424, + "loss": 1.4162, + "grad_norm": 1.8770884275436401, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352902656, + "loss": 1.3801, + "grad_norm": 1.0354937314987183, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352844288, + "loss": 1.4998, + "grad_norm": 2.673734426498413, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352922624, + "loss": 1.5193, + "grad_norm": 2.493962526321411, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 1.4085, + "grad_norm": 0.7207130789756775, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352819712, + "loss": 1.4653, + "grad_norm": 1.8180922269821167, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352864256, + "loss": 1.3948, + "grad_norm": 0.7706848978996277, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352856576, + "loss": 1.391, + "grad_norm": 1.082289218902588, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352879616, + "loss": 1.4856, + "grad_norm": 1.94290292263031, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35284736, + "loss": 1.4329, + "grad_norm": 1.944095492362976, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352838144, + "loss": 1.4054, + "grad_norm": 1.3239630460739136, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352875008, + "loss": 1.3884, + "grad_norm": 1.1863479614257812, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352914944, + "loss": 1.3823, + "grad_norm": 0.5400068759918213, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352844288, + "loss": 1.372, + "grad_norm": 0.633487343788147, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352745984, + "loss": 1.3859, + "grad_norm": 1.0518360137939453, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352827392, + "loss": 1.4395, + "grad_norm": 1.6499873399734497, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35293184, + "loss": 1.409, + "grad_norm": 0.6572389602661133, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35280896, + "loss": 1.4166, + "grad_norm": 1.1665669679641724, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352861184, + "loss": 1.4173, + "grad_norm": 1.0875539779663086, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352853504, + "loss": 1.4248, + "grad_norm": 1.314205288887024, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352769024, + "loss": 1.4466, + "grad_norm": 1.162074089050293, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352792064, + "loss": 1.4197, + "grad_norm": 1.4587390422821045, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352841216, + "loss": 1.3863, + "grad_norm": 0.39181220531463623, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 1.415, + "grad_norm": 0.6939945816993713, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352848896, + "loss": 1.4498, + "grad_norm": 1.063015103340149, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352891904, + "loss": 1.4498, + "grad_norm": 1.4910989999771118, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35283968, + "loss": 1.4146, + "grad_norm": 0.8678625226020813, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35277824, + "loss": 1.4041, + "grad_norm": 0.5515819191932678, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352879616, + "loss": 1.3628, + "grad_norm": 0.656143844127655, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352896512, + "loss": 1.3231, + "grad_norm": 1.3540343046188354, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352736768, + "loss": 1.5884, + "grad_norm": 3.366746664047241, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352842752, + "loss": 1.5523, + "grad_norm": 3.321096897125244, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352815104, + "loss": 1.4613, + "grad_norm": 2.4717841148376465, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352828928, + "loss": 1.4337, + "grad_norm": 2.0109541416168213, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352835072, + "loss": 1.405, + "grad_norm": 0.9913496375083923, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352864256, + "loss": 1.3881, + "grad_norm": 0.641679584980011, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352805888, + "loss": 1.422, + "grad_norm": 1.9523903131484985, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352827392, + "loss": 1.4276, + "grad_norm": 1.3002896308898926, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352853504, + "loss": 1.3583, + "grad_norm": 1.0953158140182495, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35280128, + "loss": 1.4236, + "grad_norm": 1.1592082977294922, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352833536, + "loss": 1.3866, + "grad_norm": 1.1234554052352905, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352825856, + "loss": 1.3687, + "grad_norm": 1.1195861101150513, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352765952, + "loss": 1.4623, + "grad_norm": 1.7494040727615356, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352967168, + "loss": 1.4243, + "grad_norm": 1.1749669313430786, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352781312, + "loss": 1.4252, + "grad_norm": 1.4704701900482178, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352948736, + "loss": 1.4061, + "grad_norm": 0.7859480381011963, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 1.4598, + "grad_norm": 1.7841439247131348, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352818176, + "loss": 1.4684, + "grad_norm": 1.594376802444458, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352865792, + "loss": 1.3853, + "grad_norm": 0.8526685833930969, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35284736, + "loss": 1.4498, + "grad_norm": 1.7927286624908447, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352812032, + "loss": 1.3772, + "grad_norm": 0.5490004420280457, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 1.4304, + "grad_norm": 0.8933591246604919, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352845824, + "loss": 1.439, + "grad_norm": 1.0588526725769043, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352805888, + "loss": 1.4206, + "grad_norm": 0.9819344282150269, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352894976, + "loss": 1.4206, + "grad_norm": 1.0478428602218628, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352764416, + "loss": 1.4847, + "grad_norm": 2.709792375564575, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352827392, + "loss": 1.4603, + "grad_norm": 1.982580542564392, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 1.4044, + "grad_norm": 0.9580506682395935, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352812032, + "loss": 1.3668, + "grad_norm": 0.6111714243888855, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352980992, + "loss": 1.4118, + "grad_norm": 0.9242130517959595, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35277824, + "loss": 1.3737, + "grad_norm": 0.9589683413505554, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352779776, + "loss": 1.3687, + "grad_norm": 0.622128427028656, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352879616, + "loss": 1.4325, + "grad_norm": 1.2860844135284424, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35297792, + "loss": 1.4479, + "grad_norm": 1.3254040479660034, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352807424, + "loss": 1.3987, + "grad_norm": 0.5632498860359192, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35281664, + "loss": 1.3843, + "grad_norm": 0.3361908495426178, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35287808, + "loss": 1.3823, + "grad_norm": 0.9211949110031128, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35277056, + "loss": 1.3865, + "grad_norm": 0.6656874418258667, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352865792, + "loss": 1.3858, + "grad_norm": 0.5868481397628784, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352999424, + "loss": 1.3788, + "grad_norm": 0.8646591901779175, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352902656, + "loss": 1.4433, + "grad_norm": 1.9160512685775757, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352948736, + "loss": 1.4103, + "grad_norm": 1.1070753335952759, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35289344, + "loss": 1.3864, + "grad_norm": 0.5794543623924255, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35293184, + "loss": 1.435, + "grad_norm": 2.0068302154541016, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35285504, + "loss": 1.3434, + "grad_norm": 0.6902003288269043, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352890368, + "loss": 1.4321, + "grad_norm": 1.2766541242599487, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352796672, + "loss": 1.361, + "grad_norm": 0.32857123017311096, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352821248, + "loss": 1.4408, + "grad_norm": 1.1822842359542847, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3527936, + "loss": 1.3688, + "grad_norm": 0.3609676659107208, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352799744, + "loss": 1.3979, + "grad_norm": 0.9842472672462463, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352790528, + "loss": 1.4401, + "grad_norm": 1.1145094633102417, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35272448, + "loss": 1.4371, + "grad_norm": 1.3032236099243164, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352887296, + "loss": 1.3884, + "grad_norm": 0.6530729532241821, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352799744, + "loss": 1.4033, + "grad_norm": 1.386460542678833, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352802816, + "loss": 1.4369, + "grad_norm": 1.2974112033843994, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352858112, + "loss": 1.4533, + "grad_norm": 1.5056179761886597, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352761344, + "loss": 1.4855, + "grad_norm": 1.6523871421813965, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352928768, + "loss": 1.4047, + "grad_norm": 0.8765649795532227, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 1.3678, + "grad_norm": 0.8552773594856262, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352784384, + "loss": 1.395, + "grad_norm": 1.493587613105774, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 1.3755, + "grad_norm": 1.102967619895935, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352761344, + "loss": 1.4126, + "grad_norm": 1.4228878021240234, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 1.4726, + "grad_norm": 1.747502088546753, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352821248, + "loss": 1.395, + "grad_norm": 0.7795854806900024, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352765952, + "loss": 1.3605, + "grad_norm": 0.2929771840572357, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35276288, + "loss": 1.406, + "grad_norm": 0.9075442552566528, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352864256, + "loss": 1.3881, + "grad_norm": 0.8333525061607361, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352868864, + "loss": 1.3812, + "grad_norm": 0.7974780201911926, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352967168, + "loss": 1.4196, + "grad_norm": 0.9166802763938904, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352810496, + "loss": 1.3928, + "grad_norm": 0.6869223713874817, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352802816, + "loss": 1.3997, + "grad_norm": 0.5912744998931885, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352745984, + "loss": 1.3797, + "grad_norm": 0.35113203525543213, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35277824, + "loss": 1.3866, + "grad_norm": 0.4070204198360443, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352805888, + "loss": 1.3902, + "grad_norm": 0.5558227300643921, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352856576, + "loss": 1.393, + "grad_norm": 0.6081562638282776, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352769024, + "loss": 1.4067, + "grad_norm": 0.26730141043663025, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352788992, + "loss": 1.3862, + "grad_norm": 0.5466495752334595, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352865792, + "loss": 1.4026, + "grad_norm": 0.6377905607223511, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352767488, + "loss": 1.3969, + "grad_norm": 0.3062172532081604, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352868864, + "loss": 1.4079, + "grad_norm": 0.5129101276397705, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352825856, + "loss": 1.4107, + "grad_norm": 0.7166882157325745, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35282432, + "loss": 1.3622, + "grad_norm": 0.36137402057647705, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352807424, + "loss": 1.3808, + "grad_norm": 0.2736084461212158, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35292416, + "loss": 1.4169, + "grad_norm": 0.6300257444381714, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352819712, + "loss": 1.3951, + "grad_norm": 0.5753377079963684, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352899584, + "loss": 1.393, + "grad_norm": 0.46469539403915405, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35280896, + "loss": 1.4714, + "grad_norm": 9.491832733154297, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35277056, + "loss": 1.3918, + "grad_norm": 0.6748173832893372, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352773632, + "loss": 1.3711, + "grad_norm": 0.5326930284500122, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352841216, + "loss": 1.3644, + "grad_norm": 0.4738723933696747, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352848896, + "loss": 1.4101, + "grad_norm": 0.5205197930335999, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 1.431, + "grad_norm": 0.7159079313278198, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352802816, + "loss": 1.3882, + "grad_norm": 0.36445486545562744, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35283968, + "loss": 1.3943, + "grad_norm": 0.6018402576446533, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352782848, + "loss": 1.3924, + "grad_norm": 0.3792208433151245, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352853504, + "loss": 1.3847, + "grad_norm": 0.6294650435447693, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352750592, + "loss": 1.3778, + "grad_norm": 0.5824689269065857, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352759808, + "loss": 1.4195, + "grad_norm": 0.9922394156455994, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 1.4083, + "grad_norm": 0.8130871653556824, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352844288, + "loss": 1.3745, + "grad_norm": 0.44798052310943604, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352850432, + "loss": 1.4027, + "grad_norm": 0.5486446022987366, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352769024, + "loss": 1.4007, + "grad_norm": 0.647663414478302, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352879616, + "loss": 1.4152, + "grad_norm": 0.58375483751297, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352867328, + "loss": 1.379, + "grad_norm": 0.40665125846862793, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352850432, + "loss": 1.4041, + "grad_norm": 0.7840951085090637, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352788992, + "loss": 1.3896, + "grad_norm": 0.845903754234314, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352821248, + "loss": 1.3667, + "grad_norm": 0.22181794047355652, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352918016, + "loss": 1.3698, + "grad_norm": 0.18324774503707886, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352890368, + "loss": 1.3618, + "grad_norm": 0.35232436656951904, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 1.5276, + "grad_norm": 1.276874303817749, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352827392, + "loss": 1.3944, + "grad_norm": 0.6247072219848633, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35281664, + "loss": 1.404, + "grad_norm": 0.59803307056427, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352753664, + "loss": 1.4162, + "grad_norm": 1.2608247995376587, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352875008, + "loss": 1.3939, + "grad_norm": 0.27602455019950867, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352796672, + "loss": 1.4069, + "grad_norm": 1.1514520645141602, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3529088, + "loss": 1.3741, + "grad_norm": 0.4509068429470062, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352904192, + "loss": 1.3992, + "grad_norm": 1.0623581409454346, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 2.0603, + "grad_norm": 1.0149401426315308, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453829632, + "loss": 1.3958, + "grad_norm": 0.8498672842979431, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45382656, + "loss": 1.3833, + "grad_norm": 0.9680757522583008, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45371136, + "loss": 1.3779, + "grad_norm": 0.8814850449562073, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453762048, + "loss": 1.3981, + "grad_norm": 0.9682685732841492, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453755904, + "loss": 1.4412, + "grad_norm": 1.3773341178894043, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45380352, + "loss": 1.3755, + "grad_norm": 0.6738574504852295, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453735936, + "loss": 1.339, + "grad_norm": 0.4911693036556244, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453782016, + "loss": 1.4136, + "grad_norm": 0.8073433041572571, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45379584, + "loss": 1.3798, + "grad_norm": 0.9644945859909058, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453751296, + "loss": 1.3574, + "grad_norm": 0.9469156265258789, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453743616, + "loss": 1.4126, + "grad_norm": 1.3905264139175415, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45386496, + "loss": 1.3942, + "grad_norm": 1.4367321729660034, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453762048, + "loss": 1.3701, + "grad_norm": 1.0105152130126953, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 1.3555, + "grad_norm": 0.8272319436073303, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453766656, + "loss": 1.361, + "grad_norm": 0.6120912432670593, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453692928, + "loss": 1.446, + "grad_norm": 1.152339220046997, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45368832, + "loss": 1.3943, + "grad_norm": 1.858020305633545, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453739008, + "loss": 1.3632, + "grad_norm": 1.4007282257080078, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453868032, + "loss": 1.4056, + "grad_norm": 1.152721881866455, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453729792, + "loss": 1.3682, + "grad_norm": 1.0392452478408813, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453755904, + "loss": 1.3367, + "grad_norm": 0.6765034198760986, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453768192, + "loss": 1.3794, + "grad_norm": 0.8398991823196411, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453740544, + "loss": 1.4453, + "grad_norm": 2.3475937843322754, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453714432, + "loss": 1.4041, + "grad_norm": 0.8751002550125122, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45372672, + "loss": 1.4176, + "grad_norm": 1.0366615056991577, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453754368, + "loss": 1.3565, + "grad_norm": 0.8589709401130676, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4537728, + "loss": 1.3745, + "grad_norm": 0.49997156858444214, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453823488, + "loss": 1.3799, + "grad_norm": 0.6777386665344238, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453774336, + "loss": 1.4095, + "grad_norm": 0.8596842885017395, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453725184, + "loss": 1.4122, + "grad_norm": 1.0854560136795044, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453769728, + "loss": 1.3973, + "grad_norm": 0.46122977137565613, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453668352, + "loss": 1.4117, + "grad_norm": 1.026164174079895, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453735936, + "loss": 1.3979, + "grad_norm": 0.47771933674812317, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45372672, + "loss": 1.373, + "grad_norm": 0.6140305399894714, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453731328, + "loss": 1.3644, + "grad_norm": 1.4611433744430542, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453846528, + "loss": 1.4352, + "grad_norm": 1.5461804866790771, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453806592, + "loss": 1.4412, + "grad_norm": 1.4716558456420898, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45376512, + "loss": 1.4238, + "grad_norm": 1.4125932455062866, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 1.375, + "grad_norm": 1.138948917388916, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45378816, + "loss": 1.3831, + "grad_norm": 0.4658820331096649, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453808128, + "loss": 1.3785, + "grad_norm": 0.4429713785648346, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453725184, + "loss": 1.4402, + "grad_norm": 1.4898957014083862, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453860352, + "loss": 1.4113, + "grad_norm": 0.9233662486076355, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453814272, + "loss": 1.3747, + "grad_norm": 0.6659481525421143, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453835776, + "loss": 1.3962, + "grad_norm": 0.63540118932724, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453722112, + "loss": 1.3607, + "grad_norm": 1.612032175064087, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453709824, + "loss": 1.4476, + "grad_norm": 1.5543557405471802, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453714432, + "loss": 1.4377, + "grad_norm": 1.4433249235153198, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453751296, + "loss": 1.4101, + "grad_norm": 0.998134970664978, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453712896, + "loss": 1.3461, + "grad_norm": 0.6123658418655396, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453758976, + "loss": 1.3918, + "grad_norm": 0.43358391523361206, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453768192, + "loss": 1.3876, + "grad_norm": 0.3141128718852997, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453806592, + "loss": 1.3908, + "grad_norm": 0.6028469204902649, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453725184, + "loss": 1.3727, + "grad_norm": 0.5012947916984558, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4537344, + "loss": 1.3698, + "grad_norm": 0.7302643656730652, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453800448, + "loss": 1.3766, + "grad_norm": 0.40093159675598145, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45372672, + "loss": 1.3797, + "grad_norm": 0.47039878368377686, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453720576, + "loss": 1.3952, + "grad_norm": 0.5799374580383301, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453786624, + "loss": 1.3878, + "grad_norm": 0.5093396902084351, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453745152, + "loss": 1.3473, + "grad_norm": 0.36066603660583496, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453858816, + "loss": 1.3659, + "grad_norm": 0.6175323724746704, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453735936, + "loss": 1.3676, + "grad_norm": 0.6471043229103088, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453762048, + "loss": 1.3887, + "grad_norm": 0.512557327747345, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453768192, + "loss": 1.3693, + "grad_norm": 0.8529994487762451, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453835776, + "loss": 1.393, + "grad_norm": 0.5022062063217163, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453691392, + "loss": 1.3698, + "grad_norm": 0.6844445466995239, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453825024, + "loss": 1.4072, + "grad_norm": 0.6347677707672119, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453854208, + "loss": 1.3502, + "grad_norm": 0.7422722578048706, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453723648, + "loss": 1.422, + "grad_norm": 0.8988478183746338, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453725184, + "loss": 1.3995, + "grad_norm": 0.699089765548706, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45372672, + "loss": 1.3514, + "grad_norm": 0.43406715989112854, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453800448, + "loss": 1.4232, + "grad_norm": 0.9225919246673584, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453782016, + "loss": 1.3377, + "grad_norm": 0.38811245560646057, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453725184, + "loss": 1.4031, + "grad_norm": 0.6498596668243408, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 1.3771, + "grad_norm": 0.6751839518547058, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453831168, + "loss": 1.368, + "grad_norm": 0.5586623549461365, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453783552, + "loss": 1.3617, + "grad_norm": 0.4317017197608948, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45391104, + "loss": 1.3938, + "grad_norm": 0.7610272765159607, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45379584, + "loss": 1.3793, + "grad_norm": 0.49989810585975647, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453745152, + "loss": 1.3337, + "grad_norm": 0.47165215015411377, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453732864, + "loss": 1.3332, + "grad_norm": 0.5304033756256104, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453725184, + "loss": 1.3925, + "grad_norm": 0.5482867956161499, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453848064, + "loss": 1.3867, + "grad_norm": 0.691121518611908, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45378816, + "loss": 1.3908, + "grad_norm": 0.8983801603317261, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4537344, + "loss": 1.4015, + "grad_norm": 0.826675534248352, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453751296, + "loss": 1.3672, + "grad_norm": 0.6555270552635193, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453891072, + "loss": 1.3821, + "grad_norm": 0.5404415130615234, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453762048, + "loss": 1.3993, + "grad_norm": 0.7674772143363953, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45375744, + "loss": 1.4558, + "grad_norm": 1.1596359014511108, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453660672, + "loss": 1.4172, + "grad_norm": 0.7901501655578613, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453735936, + "loss": 1.3969, + "grad_norm": 0.8071657419204712, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453755904, + "loss": 1.3838, + "grad_norm": 0.6649911999702454, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453722112, + "loss": 1.3703, + "grad_norm": 0.43581411242485046, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453683712, + "loss": 1.3945, + "grad_norm": 0.6741722822189331, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453809664, + "loss": 1.3924, + "grad_norm": 0.8707190155982971, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453686784, + "loss": 1.4107, + "grad_norm": 0.7651141881942749, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453729792, + "loss": 1.4101, + "grad_norm": 0.7320521473884583, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453712896, + "loss": 1.3419, + "grad_norm": 0.6728614568710327, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453754368, + "loss": 1.372, + "grad_norm": 0.6952121257781982, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453694464, + "loss": 1.3715, + "grad_norm": 0.5669242739677429, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453717504, + "loss": 1.3869, + "grad_norm": 0.7775692343711853, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453706752, + "loss": 1.3472, + "grad_norm": 0.4285772442817688, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45371904, + "loss": 1.3881, + "grad_norm": 0.4577607810497284, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453722112, + "loss": 1.3526, + "grad_norm": 0.898563027381897, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45370368, + "loss": 1.377, + "grad_norm": 0.4703174829483032, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453709824, + "loss": 1.3698, + "grad_norm": 0.7467511296272278, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4536576, + "loss": 1.4298, + "grad_norm": 1.3383394479751587, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453729792, + "loss": 1.3436, + "grad_norm": 0.6041971445083618, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453771264, + "loss": 1.363, + "grad_norm": 0.9187496304512024, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45371904, + "loss": 1.3525, + "grad_norm": 0.693581759929657, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453729792, + "loss": 1.3634, + "grad_norm": 0.8275741934776306, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453712896, + "loss": 1.3819, + "grad_norm": 1.0999479293823242, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453768192, + "loss": 1.3841, + "grad_norm": 1.0416158437728882, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4537344, + "loss": 1.3329, + "grad_norm": 0.7069823145866394, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453743616, + "loss": 1.3445, + "grad_norm": 0.7428996562957764, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453745152, + "loss": 1.4137, + "grad_norm": 1.3815035820007324, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45397248, + "loss": 1.3947, + "grad_norm": 0.95113205909729, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45376512, + "loss": 1.3737, + "grad_norm": 0.9358564019203186, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453737472, + "loss": 1.3505, + "grad_norm": 0.8907728791236877, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453702144, + "loss": 1.3271, + "grad_norm": 0.9062132835388184, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453792768, + "loss": 1.3573, + "grad_norm": 1.1136481761932373, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453717504, + "loss": 1.2945, + "grad_norm": 1.018342137336731, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45376512, + "loss": 1.316, + "grad_norm": 0.8377622961997986, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453706752, + "loss": 1.3358, + "grad_norm": 1.2049169540405273, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453927936, + "loss": 1.3891, + "grad_norm": 1.2102223634719849, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453809664, + "loss": 1.3765, + "grad_norm": 1.1757296323776245, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453702144, + "loss": 1.3775, + "grad_norm": 1.5597118139266968, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45374208, + "loss": 1.3883, + "grad_norm": 1.3349967002868652, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453789696, + "loss": 1.3181, + "grad_norm": 1.0469002723693848, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453755904, + "loss": 1.3763, + "grad_norm": 1.2387107610702515, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4538112, + "loss": 1.3657, + "grad_norm": 1.1863651275634766, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45370368, + "loss": 1.3624, + "grad_norm": 0.9617934226989746, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45378816, + "loss": 1.3479, + "grad_norm": 1.1325657367706299, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453777408, + "loss": 1.2632, + "grad_norm": 0.8105853796005249, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453732864, + "loss": 1.3193, + "grad_norm": 0.8505505919456482, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453821952, + "loss": 1.4377, + "grad_norm": 0.9360431432723999, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45374976, + "loss": 1.3416, + "grad_norm": 0.7653841972351074, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453758976, + "loss": 1.3641, + "grad_norm": 0.7733296751976013, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4538112, + "loss": 1.3494, + "grad_norm": 0.7492001056671143, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4538112, + "loss": 1.2655, + "grad_norm": 1.1832350492477417, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453848064, + "loss": 1.3244, + "grad_norm": 0.9805998206138611, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45376512, + "loss": 1.3478, + "grad_norm": 0.9968200922012329, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45383424, + "loss": 1.337, + "grad_norm": 0.8987478613853455, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453775872, + "loss": 1.3349, + "grad_norm": 1.1536544561386108, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453782016, + "loss": 1.333, + "grad_norm": 1.4061307907104492, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453692928, + "loss": 1.3572, + "grad_norm": 1.3116523027420044, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 3.324944384, + "gpu_mem": 1.4538112, + "loss": 1.2523, + "grad_norm": 1.26020348072052, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 1.2946, + "grad_norm": 1.364715576171875, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453709824, + "loss": 1.3749, + "grad_norm": 1.8800787925720215, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453758976, + "loss": 1.3229, + "grad_norm": 2.146751880645752, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453884928, + "loss": 1.4245, + "grad_norm": 1.8142658472061157, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453683712, + "loss": 1.2709, + "grad_norm": 1.2885278463363647, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453748224, + "loss": 1.4239, + "grad_norm": 1.4695223569869995, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 1.3045, + "grad_norm": 0.8504621386528015, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453754368, + "loss": 1.3727, + "grad_norm": 0.7513923048973083, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453751296, + "loss": 1.2962, + "grad_norm": 1.104972004890442, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45372672, + "loss": 1.3495, + "grad_norm": 0.8508201837539673, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453709824, + "loss": 1.3171, + "grad_norm": 0.7785360813140869, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45371136, + "loss": 1.3184, + "grad_norm": 0.7284829616546631, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 1.3489, + "grad_norm": 1.2272270917892456, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453829632, + "loss": 1.3618, + "grad_norm": 1.655991554260254, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453783552, + "loss": 1.2684, + "grad_norm": 0.9685901999473572, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45383424, + "loss": 1.3998, + "grad_norm": 1.3306982517242432, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453805056, + "loss": 1.3186, + "grad_norm": 1.080527901649475, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453739008, + "loss": 1.3199, + "grad_norm": 1.209517240524292, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453696, + "loss": 1.2886, + "grad_norm": 1.112075686454773, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453723648, + "loss": 1.2125, + "grad_norm": 1.086458444595337, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453706752, + "loss": 1.3535, + "grad_norm": 1.3055917024612427, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45374976, + "loss": 1.2982, + "grad_norm": 1.6202422380447388, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45380352, + "loss": 1.2634, + "grad_norm": 1.6184481382369995, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453700608, + "loss": 1.2943, + "grad_norm": 1.512048363685608, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453760512, + "loss": 1.2717, + "grad_norm": 1.5670909881591797, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453683712, + "loss": 1.3369, + "grad_norm": 1.7760504484176636, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453843456, + "loss": 1.2762, + "grad_norm": 2.02954363822937, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453697536, + "loss": 1.2492, + "grad_norm": 1.598986029624939, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453720576, + "loss": 1.3039, + "grad_norm": 1.8501847982406616, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453743616, + "loss": 1.325, + "grad_norm": 1.8300966024398804, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453967872, + "loss": 1.3995, + "grad_norm": 1.6123273372650146, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453725184, + "loss": 1.3406, + "grad_norm": 1.2632520198822021, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453848064, + "loss": 1.2912, + "grad_norm": 1.3231995105743408, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453763584, + "loss": 1.3301, + "grad_norm": 1.311618685722351, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453852672, + "loss": 1.3421, + "grad_norm": 1.4206749200820923, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453751296, + "loss": 1.281, + "grad_norm": 1.0758236646652222, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453758976, + "loss": 1.3199, + "grad_norm": 1.1319730281829834, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453682176, + "loss": 1.3135, + "grad_norm": 0.9373548626899719, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453762048, + "loss": 1.2801, + "grad_norm": 0.965552568435669, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453732864, + "loss": 1.3592, + "grad_norm": 1.4239588975906372, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453720576, + "loss": 1.3296, + "grad_norm": 1.323668122291565, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453863424, + "loss": 1.2498, + "grad_norm": 1.0192303657531738, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453746688, + "loss": 1.2587, + "grad_norm": 0.9681774377822876, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45375744, + "loss": 1.3734, + "grad_norm": 1.1226109266281128, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453912576, + "loss": 1.3448, + "grad_norm": 1.151957392692566, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453722112, + "loss": 1.2739, + "grad_norm": 1.1765815019607544, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453743616, + "loss": 1.2645, + "grad_norm": 1.1149661540985107, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 3.324944384, + "gpu_mem": 1.45378048, + "loss": 1.2674, + "grad_norm": 0.9648839831352234, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 3.324944384, + "gpu_mem": 1.453367296, + "loss": 1.8862, + "grad_norm": 1.9592204093933105, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352776704, + "loss": 1.1093, + "grad_norm": 1.63056218624115, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3527552, + "loss": 1.2132, + "grad_norm": 1.4339247941970825, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35277056, + "loss": 1.2001, + "grad_norm": 1.5302265882492065, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352918016, + "loss": 1.0368, + "grad_norm": 1.7784756422042847, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352864256, + "loss": 1.2554, + "grad_norm": 2.6294076442718506, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352881152, + "loss": 1.1158, + "grad_norm": 2.321924924850464, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352833536, + "loss": 1.2058, + "grad_norm": 2.911749839782715, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352858112, + "loss": 1.1036, + "grad_norm": 2.693270206451416, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352905728, + "loss": 1.2686, + "grad_norm": 3.1857683658599854, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352815104, + "loss": 1.1304, + "grad_norm": 2.507841110229492, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352787456, + "loss": 1.0114, + "grad_norm": 2.491103410720825, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352876544, + "loss": 1.0488, + "grad_norm": 2.329896926879883, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352767488, + "loss": 0.9993, + "grad_norm": 2.3024940490722656, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352819712, + "loss": 1.1364, + "grad_norm": 2.3313651084899902, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352773632, + "loss": 1.1179, + "grad_norm": 2.533749580383301, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352976384, + "loss": 1.0792, + "grad_norm": 2.034925699234009, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352974848, + "loss": 1.1076, + "grad_norm": 2.286083936691284, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352898048, + "loss": 1.0094, + "grad_norm": 2.1345715522766113, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35285504, + "loss": 1.1192, + "grad_norm": 2.266895055770874, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352832, + "loss": 1.061, + "grad_norm": 2.5291354656219482, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35278592, + "loss": 0.994, + "grad_norm": 2.131821632385254, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352902656, + "loss": 1.2076, + "grad_norm": 2.849609136581421, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352790528, + "loss": 1.1371, + "grad_norm": 2.0719189643859863, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352779776, + "loss": 1.0678, + "grad_norm": 2.4156534671783447, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352784384, + "loss": 1.1101, + "grad_norm": 2.128600597381592, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352899584, + "loss": 1.2129, + "grad_norm": 2.8567538261413574, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352873472, + "loss": 1.0112, + "grad_norm": 2.4398207664489746, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352775168, + "loss": 1.0846, + "grad_norm": 2.4950618743896484, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352810496, + "loss": 1.1224, + "grad_norm": 2.4445016384124756, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352776704, + "loss": 1.1516, + "grad_norm": 2.951770544052124, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352910336, + "loss": 1.16, + "grad_norm": 2.854255199432373, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352884224, + "loss": 1.0471, + "grad_norm": 2.5598769187927246, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352928768, + "loss": 0.9379, + "grad_norm": 2.3606343269348145, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352819712, + "loss": 1.1839, + "grad_norm": 2.565048933029175, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 0.8923, + "grad_norm": 2.7127721309661865, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352911872, + "loss": 1.1465, + "grad_norm": 2.6912388801574707, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352759808, + "loss": 1.2599, + "grad_norm": 3.134354591369629, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35287808, + "loss": 1.0304, + "grad_norm": 2.725787878036499, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352867328, + "loss": 0.9127, + "grad_norm": 2.5131096839904785, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352833536, + "loss": 1.1121, + "grad_norm": 2.7520344257354736, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352812032, + "loss": 1.2504, + "grad_norm": 3.4601330757141113, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352807424, + "loss": 1.0117, + "grad_norm": 2.4564454555511475, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352818176, + "loss": 1.1592, + "grad_norm": 3.0102241039276123, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352871936, + "loss": 1.0649, + "grad_norm": 3.095808982849121, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352788992, + "loss": 1.0753, + "grad_norm": 2.557029962539673, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35281664, + "loss": 1.0576, + "grad_norm": 2.5501582622528076, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352815104, + "loss": 0.906, + "grad_norm": 2.7032418251037598, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352858112, + "loss": 1.1258, + "grad_norm": 3.266439437866211, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352856576, + "loss": 1.1972, + "grad_norm": 2.935216188430786, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352907264, + "loss": 1.0071, + "grad_norm": 2.6737148761749268, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352913408, + "loss": 1.0083, + "grad_norm": 2.8401026725769043, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352851968, + "loss": 1.186, + "grad_norm": 3.299640417098999, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352841216, + "loss": 1.091, + "grad_norm": 2.7352726459503174, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352865792, + "loss": 0.9819, + "grad_norm": 2.265723705291748, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352882688, + "loss": 1.0221, + "grad_norm": 2.974802017211914, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352753664, + "loss": 1.1014, + "grad_norm": 2.820946455001831, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352928768, + "loss": 0.9217, + "grad_norm": 2.7393980026245117, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35288576, + "loss": 1.0385, + "grad_norm": 2.9518496990203857, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 1.0608, + "grad_norm": 2.6857099533081055, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352742912, + "loss": 0.9102, + "grad_norm": 2.815359592437744, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352758272, + "loss": 1.006, + "grad_norm": 2.8635025024414062, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352859648, + "loss": 0.9616, + "grad_norm": 2.782430648803711, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352868864, + "loss": 0.9412, + "grad_norm": 2.8637030124664307, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352898048, + "loss": 1.0313, + "grad_norm": 3.1261494159698486, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352959488, + "loss": 1.0914, + "grad_norm": 2.958913803100586, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352813568, + "loss": 1.0098, + "grad_norm": 2.9808688163757324, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35290112, + "loss": 1.1353, + "grad_norm": 4.092074871063232, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 1.0183, + "grad_norm": 2.996701240539551, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352875008, + "loss": 1.1201, + "grad_norm": 3.4018290042877197, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352910336, + "loss": 0.9539, + "grad_norm": 2.9501383304595947, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352848896, + "loss": 0.9785, + "grad_norm": 2.728257656097412, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352805888, + "loss": 1.1587, + "grad_norm": 3.1722512245178223, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352807424, + "loss": 0.9527, + "grad_norm": 2.5447630882263184, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352844288, + "loss": 0.9271, + "grad_norm": 2.7622387409210205, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35281664, + "loss": 1.031, + "grad_norm": 2.8288896083831787, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352873472, + "loss": 0.8749, + "grad_norm": 2.636232614517212, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352902656, + "loss": 0.9788, + "grad_norm": 3.1775901317596436, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352802816, + "loss": 1.0337, + "grad_norm": 3.1117215156555176, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352825856, + "loss": 0.9846, + "grad_norm": 2.926259994506836, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35280128, + "loss": 0.9603, + "grad_norm": 2.961151361465454, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35289344, + "loss": 1.0652, + "grad_norm": 3.0996322631835938, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35276288, + "loss": 1.0294, + "grad_norm": 3.003209114074707, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352899584, + "loss": 1.0057, + "grad_norm": 2.7728426456451416, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352756736, + "loss": 0.8904, + "grad_norm": 2.5778965950012207, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352779776, + "loss": 0.9847, + "grad_norm": 3.361400842666626, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352827392, + "loss": 1.0963, + "grad_norm": 3.0457987785339355, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3528704, + "loss": 1.0758, + "grad_norm": 3.2192020416259766, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352798208, + "loss": 1.1627, + "grad_norm": 3.2584197521209717, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352896512, + "loss": 1.0725, + "grad_norm": 3.1746387481689453, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352792064, + "loss": 1.0555, + "grad_norm": 3.2349209785461426, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3529472, + "loss": 0.8472, + "grad_norm": 2.7507317066192627, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352779776, + "loss": 1.0426, + "grad_norm": 3.0506534576416016, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 0.9873, + "grad_norm": 2.9527781009674072, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352844288, + "loss": 1.0835, + "grad_norm": 3.127012252807617, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352767488, + "loss": 1.0708, + "grad_norm": 3.167412042617798, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352904192, + "loss": 0.9913, + "grad_norm": 3.39615797996521, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352792064, + "loss": 0.946, + "grad_norm": 2.7632601261138916, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352788992, + "loss": 1.1035, + "grad_norm": 3.6609694957733154, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35276288, + "loss": 1.1663, + "grad_norm": 3.0487022399902344, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352798208, + "loss": 0.8911, + "grad_norm": 3.21035099029541, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352813568, + "loss": 0.7973, + "grad_norm": 2.8469135761260986, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352807424, + "loss": 0.8921, + "grad_norm": 2.869516611099243, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352819712, + "loss": 1.0379, + "grad_norm": 3.1953909397125244, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352851968, + "loss": 1.0099, + "grad_norm": 3.3635048866271973, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352853504, + "loss": 1.1393, + "grad_norm": 3.2022902965545654, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352825856, + "loss": 1.085, + "grad_norm": 3.249678134918213, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352875008, + "loss": 1.0539, + "grad_norm": 3.229567289352417, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 1.0301, + "grad_norm": 3.2672300338745117, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35286272, + "loss": 0.8426, + "grad_norm": 3.0078487396240234, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352853504, + "loss": 0.8348, + "grad_norm": 3.0929911136627197, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352787456, + "loss": 1.0543, + "grad_norm": 3.915433645248413, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35280896, + "loss": 0.9602, + "grad_norm": 3.2177000045776367, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352864256, + "loss": 0.9711, + "grad_norm": 3.462601900100708, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352781312, + "loss": 1.0542, + "grad_norm": 3.2760233879089355, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352845824, + "loss": 1.0711, + "grad_norm": 3.7297189235687256, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 0.8113, + "grad_norm": 3.1316630840301514, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 1.0528, + "grad_norm": 3.1354620456695557, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352810496, + "loss": 0.9304, + "grad_norm": 2.948395252227783, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 1.0601, + "grad_norm": 3.375399351119995, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352815104, + "loss": 1.1793, + "grad_norm": 4.109281539916992, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352719872, + "loss": 1.089, + "grad_norm": 3.378720760345459, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352788992, + "loss": 0.9628, + "grad_norm": 3.109628438949585, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 1.027, + "grad_norm": 3.5308456420898438, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352781312, + "loss": 1.1612, + "grad_norm": 3.852613687515259, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35283968, + "loss": 1.019, + "grad_norm": 3.191760301589966, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352845824, + "loss": 0.94, + "grad_norm": 3.259784460067749, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352982528, + "loss": 0.8096, + "grad_norm": 2.7223594188690186, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352792064, + "loss": 0.913, + "grad_norm": 2.5968165397644043, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352856576, + "loss": 0.984, + "grad_norm": 3.148865222930908, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352772096, + "loss": 0.9712, + "grad_norm": 3.3214592933654785, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352788992, + "loss": 0.8904, + "grad_norm": 3.1049928665161133, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352788992, + "loss": 1.0261, + "grad_norm": 3.3733370304107666, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352914944, + "loss": 1.0796, + "grad_norm": 3.7819409370422363, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 0.8705, + "grad_norm": 3.0274670124053955, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352953344, + "loss": 0.9553, + "grad_norm": 3.4483094215393066, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35277824, + "loss": 1.0741, + "grad_norm": 3.3103368282318115, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352796672, + "loss": 1.0691, + "grad_norm": 3.491440773010254, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352973312, + "loss": 0.9196, + "grad_norm": 2.8111774921417236, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35284736, + "loss": 0.941, + "grad_norm": 3.293686628341675, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352844288, + "loss": 1.0484, + "grad_norm": 4.000848293304443, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352853504, + "loss": 0.9577, + "grad_norm": 3.225452184677124, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352729088, + "loss": 0.9889, + "grad_norm": 3.0239334106445312, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 3.324944384, + "gpu_mem": 1.353011712, + "loss": 1.0711, + "grad_norm": 3.9757115840911865, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352775168, + "loss": 0.9623, + "grad_norm": 3.345836639404297, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3528704, + "loss": 0.9697, + "grad_norm": 3.0673019886016846, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 1.0672, + "grad_norm": 3.446991205215454, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 1.0885, + "grad_norm": 3.5903167724609375, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 1.0694, + "grad_norm": 3.246758222579956, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352873472, + "loss": 1.0668, + "grad_norm": 3.5431694984436035, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352769024, + "loss": 1.0716, + "grad_norm": 3.784592628479004, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352798208, + "loss": 0.9476, + "grad_norm": 3.7763004302978516, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352741376, + "loss": 1.0351, + "grad_norm": 3.8660154342651367, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35281664, + "loss": 0.9441, + "grad_norm": 3.1726887226104736, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35280896, + "loss": 0.9058, + "grad_norm": 3.0006728172302246, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35283968, + "loss": 1.1043, + "grad_norm": 3.883101463317871, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352769024, + "loss": 1.1347, + "grad_norm": 3.4798731803894043, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 0.8366, + "grad_norm": 3.2174527645111084, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352890368, + "loss": 1.013, + "grad_norm": 3.3607895374298096, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35278592, + "loss": 0.8992, + "grad_norm": 3.3022282123565674, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352795136, + "loss": 0.9584, + "grad_norm": 3.313258647918701, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352822784, + "loss": 1.1445, + "grad_norm": 3.7894530296325684, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352836608, + "loss": 1.1419, + "grad_norm": 3.9882113933563232, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352830464, + "loss": 0.9027, + "grad_norm": 2.825953722000122, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3527936, + "loss": 1.009, + "grad_norm": 3.1212780475616455, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352851968, + "loss": 1.0942, + "grad_norm": 3.782508373260498, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352761344, + "loss": 1.1259, + "grad_norm": 3.743027448654175, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352845824, + "loss": 0.9748, + "grad_norm": 3.4961471557617188, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352884224, + "loss": 0.9173, + "grad_norm": 3.2804369926452637, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352928768, + "loss": 1.1069, + "grad_norm": 3.2742605209350586, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352890368, + "loss": 0.9809, + "grad_norm": 3.34784197807312, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352827392, + "loss": 1.0627, + "grad_norm": 3.826216459274292, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352881152, + "loss": 1.0509, + "grad_norm": 3.456963300704956, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352819712, + "loss": 0.9829, + "grad_norm": 3.3792643547058105, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352805888, + "loss": 0.8542, + "grad_norm": 2.968716859817505, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 3.324944384, + "gpu_mem": 1.353017856, + "loss": 0.8167, + "grad_norm": 2.9013547897338867, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352882688, + "loss": 1.0295, + "grad_norm": 3.2629361152648926, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352856576, + "loss": 0.8601, + "grad_norm": 3.1976027488708496, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35286272, + "loss": 0.9727, + "grad_norm": 3.1192729473114014, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352896512, + "loss": 1.0843, + "grad_norm": 3.66752552986145, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 1.0614, + "grad_norm": 3.442542314529419, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352888832, + "loss": 0.9628, + "grad_norm": 3.403029441833496, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352776704, + "loss": 0.8926, + "grad_norm": 2.8523290157318115, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35283968, + "loss": 1.0368, + "grad_norm": 3.144444704055786, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 0.9327, + "grad_norm": 2.794154405593872, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352804352, + "loss": 0.9117, + "grad_norm": 3.1563830375671387, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352888832, + "loss": 1.1254, + "grad_norm": 3.2360873222351074, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352835072, + "loss": 1.0166, + "grad_norm": 3.4858782291412354, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35285504, + "loss": 1.1659, + "grad_norm": 3.9053561687469482, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352782848, + "loss": 1.0135, + "grad_norm": 3.2541697025299072, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 3.324944384, + "gpu_mem": 1.3528704, + "loss": 1.169, + "grad_norm": 3.694669723510742, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352838144, + "loss": 0.9361, + "grad_norm": 2.8719639778137207, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 3.324944384, + "gpu_mem": 1.35281664, + "loss": 0.9141, + "grad_norm": 2.951301097869873, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352759808, + "loss": 1.0781, + "grad_norm": 3.2783217430114746, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352825856, + "loss": 1.2145, + "grad_norm": 3.8156402111053467, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352867328, + "loss": 0.9451, + "grad_norm": 3.21380352973938, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.324944384, + "gpu_mem": 1.352867328, + "train_runtime": 8586.9742, + "train_samples_per_second": 4.39, + "train_steps_per_second": 0.068, + "total_flos": 9.037250271924634e+16, + "train_loss": 1.289886662749206 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a34e999804ff05ab393ed2117c936e4d7827f88f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a3eb3b7d4deaeca903e392fdcf98ecd936eefadb --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.36211526544102457 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..92317f59cdd6054b8d6edc924017f937aca4050e --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-logiqa-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-logiqa-r8-a2", + "seed": 42, + "timestamp": "2025-09-01T03:14:32.968021" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..4ea30ed8a5f68c63d68fec526206ad857fb942a1 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-logiqa-r8-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 3.4093056, + "gpu_mem": 1.075266048, + "loss": 3.684, + "grad_norm": 29.871192932128906, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125748224, + "loss": 3.9445, + "grad_norm": 31.978853225708008, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125825024, + "loss": 3.7121, + "grad_norm": 30.38503646850586, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125723648, + "loss": 3.5848, + "grad_norm": 31.356014251708984, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125739008, + "loss": 3.4481, + "grad_norm": 28.966081619262695, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125731328, + "loss": 3.1637, + "grad_norm": 24.288345336914062, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125801984, + "loss": 2.7218, + "grad_norm": 20.600086212158203, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125786624, + "loss": 2.4421, + "grad_norm": 17.11469078063965, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125785088, + "loss": 2.1602, + "grad_norm": 13.502305030822754, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12579584, + "loss": 2.0204, + "grad_norm": 9.862107276916504, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125699072, + "loss": 1.6854, + "grad_norm": 5.984860897064209, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12574976, + "loss": 1.5571, + "grad_norm": 3.342724561691284, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12584192, + "loss": 1.4604, + "grad_norm": 1.8214565515518188, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.4714, + "grad_norm": 1.308097004890442, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125892608, + "loss": 1.4685, + "grad_norm": 2.3664939403533936, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125752832, + "loss": 1.4526, + "grad_norm": 2.5017569065093994, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125783552, + "loss": 1.4052, + "grad_norm": 2.2457468509674072, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125746688, + "loss": 1.3875, + "grad_norm": 2.006960391998291, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125654528, + "loss": 1.3985, + "grad_norm": 1.9313380718231201, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125692928, + "loss": 1.4441, + "grad_norm": 2.6823232173919678, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12582656, + "loss": 1.4114, + "grad_norm": 2.618835687637329, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125725184, + "loss": 1.4718, + "grad_norm": 2.88139009475708, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125752832, + "loss": 1.3991, + "grad_norm": 2.759490966796875, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125746688, + "loss": 1.3921, + "grad_norm": 1.1894447803497314, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125752832, + "loss": 1.4631, + "grad_norm": 2.9123830795288086, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125805056, + "loss": 1.4365, + "grad_norm": 3.477806806564331, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125746688, + "loss": 1.4295, + "grad_norm": 1.825005292892456, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125692928, + "loss": 1.4189, + "grad_norm": 1.2550562620162964, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125785088, + "loss": 1.3866, + "grad_norm": 0.9912756681442261, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12578048, + "loss": 1.3623, + "grad_norm": 1.7572139501571655, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125758976, + "loss": 1.3566, + "grad_norm": 1.1613632440567017, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125763584, + "loss": 1.4983, + "grad_norm": 2.4266886711120605, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125798912, + "loss": 1.3415, + "grad_norm": 1.2077975273132324, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125723648, + "loss": 1.5517, + "grad_norm": 2.723754405975342, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125771264, + "loss": 1.5328, + "grad_norm": 2.709582567214966, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125786624, + "loss": 1.5047, + "grad_norm": 2.7103588581085205, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125852672, + "loss": 1.3459, + "grad_norm": 1.2499545812606812, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125735936, + "loss": 1.3832, + "grad_norm": 0.680790364742279, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125852672, + "loss": 1.4366, + "grad_norm": 0.8685393333435059, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125774336, + "loss": 1.4119, + "grad_norm": 1.3637477159500122, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12567296, + "loss": 1.4279, + "grad_norm": 1.2685344219207764, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125745152, + "loss": 1.4254, + "grad_norm": 0.864650309085846, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125708288, + "loss": 1.4113, + "grad_norm": 0.9118521213531494, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125743616, + "loss": 1.4067, + "grad_norm": 0.594793438911438, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125797376, + "loss": 1.4022, + "grad_norm": 0.47931763529777527, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125844992, + "loss": 1.3766, + "grad_norm": 0.6501488089561462, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12567296, + "loss": 1.4189, + "grad_norm": 1.1180607080459595, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12572672, + "loss": 1.4117, + "grad_norm": 0.9935393929481506, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125715968, + "loss": 1.3985, + "grad_norm": 0.6579759120941162, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125725184, + "loss": 1.3812, + "grad_norm": 0.6745386719703674, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125820416, + "loss": 1.3768, + "grad_norm": 0.8497889637947083, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125762048, + "loss": 1.4519, + "grad_norm": 1.7728925943374634, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125840384, + "loss": 1.428, + "grad_norm": 1.2773438692092896, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.3989, + "grad_norm": 0.6734830737113953, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125737472, + "loss": 1.3957, + "grad_norm": 0.8826666474342346, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125782016, + "loss": 1.3876, + "grad_norm": 0.9055960774421692, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125774336, + "loss": 1.3761, + "grad_norm": 0.7192625403404236, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125797376, + "loss": 1.4347, + "grad_norm": 0.8826169371604919, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12576512, + "loss": 1.3897, + "grad_norm": 1.2806986570358276, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125755904, + "loss": 1.4251, + "grad_norm": 0.8851944208145142, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125792768, + "loss": 1.3796, + "grad_norm": 0.5956801176071167, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125832704, + "loss": 1.3897, + "grad_norm": 0.4349636435508728, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125762048, + "loss": 1.3717, + "grad_norm": 0.30638763308525085, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125663744, + "loss": 1.3751, + "grad_norm": 0.7063522934913635, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125745152, + "loss": 1.4605, + "grad_norm": 1.2592976093292236, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1258496, + "loss": 1.4161, + "grad_norm": 0.5962221622467041, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12572672, + "loss": 1.4519, + "grad_norm": 1.061152458190918, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125778944, + "loss": 1.4026, + "grad_norm": 0.6074525117874146, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125771264, + "loss": 1.3825, + "grad_norm": 0.6745104193687439, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125686784, + "loss": 1.402, + "grad_norm": 0.7388663291931152, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125709824, + "loss": 1.4101, + "grad_norm": 1.4112374782562256, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125758976, + "loss": 1.4321, + "grad_norm": 1.0308871269226074, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.409, + "grad_norm": 0.6527464389801025, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125766656, + "loss": 1.4177, + "grad_norm": 0.9352455139160156, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125809664, + "loss": 1.4397, + "grad_norm": 1.6825188398361206, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12575744, + "loss": 1.4201, + "grad_norm": 1.1555640697479248, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125696, + "loss": 1.416, + "grad_norm": 0.7740017771720886, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125797376, + "loss": 1.387, + "grad_norm": 0.9078391790390015, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125814272, + "loss": 1.3539, + "grad_norm": 1.4698126316070557, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125654528, + "loss": 1.4699, + "grad_norm": 1.6814128160476685, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125760512, + "loss": 1.4491, + "grad_norm": 1.5839579105377197, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125732864, + "loss": 1.4189, + "grad_norm": 1.2322072982788086, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125746688, + "loss": 1.4121, + "grad_norm": 1.403512954711914, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125752832, + "loss": 1.4026, + "grad_norm": 0.8175625801086426, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125782016, + "loss": 1.3998, + "grad_norm": 0.677025556564331, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125723648, + "loss": 1.407, + "grad_norm": 1.8027598857879639, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125745152, + "loss": 1.4073, + "grad_norm": 0.8752862215042114, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125771264, + "loss": 1.3639, + "grad_norm": 1.0207443237304688, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12571904, + "loss": 1.4191, + "grad_norm": 1.1161001920700073, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125751296, + "loss": 1.4138, + "grad_norm": 1.2527745962142944, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125743616, + "loss": 1.336, + "grad_norm": 0.5529526472091675, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125683712, + "loss": 1.4221, + "grad_norm": 1.0816890001296997, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125884928, + "loss": 1.4179, + "grad_norm": 0.887290894985199, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125699072, + "loss": 1.4314, + "grad_norm": 1.6692267656326294, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125866496, + "loss": 1.4356, + "grad_norm": 1.2087761163711548, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.3769, + "grad_norm": 0.9729386568069458, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125735936, + "loss": 1.3797, + "grad_norm": 0.8852517008781433, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125783552, + "loss": 1.3916, + "grad_norm": 1.038716435432434, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12576512, + "loss": 1.4117, + "grad_norm": 1.848562240600586, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125729792, + "loss": 1.3649, + "grad_norm": 0.5639933943748474, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.4313, + "grad_norm": 0.8479992747306824, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125763584, + "loss": 1.4535, + "grad_norm": 1.1303939819335938, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125723648, + "loss": 1.4375, + "grad_norm": 1.0699687004089355, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125812736, + "loss": 1.402, + "grad_norm": 0.5297269225120544, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125682176, + "loss": 1.3876, + "grad_norm": 0.764244019985199, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125745152, + "loss": 1.4094, + "grad_norm": 0.6557555198669434, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.3926, + "grad_norm": 0.5223321914672852, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125729792, + "loss": 1.3905, + "grad_norm": 0.6244955658912659, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125898752, + "loss": 1.378, + "grad_norm": 0.2677730321884155, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125696, + "loss": 1.3734, + "grad_norm": 0.8328213095664978, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125697536, + "loss": 1.3824, + "grad_norm": 0.7503806948661804, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125797376, + "loss": 1.3957, + "grad_norm": 0.7978232502937317, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12589568, + "loss": 1.4157, + "grad_norm": 0.964816153049469, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125725184, + "loss": 1.4035, + "grad_norm": 0.6620330810546875, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257344, + "loss": 1.3927, + "grad_norm": 0.5174351930618286, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12579584, + "loss": 1.383, + "grad_norm": 0.5747383832931519, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12568832, + "loss": 1.3732, + "grad_norm": 0.3873158395290375, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125783552, + "loss": 1.3814, + "grad_norm": 0.2574141025543213, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125917184, + "loss": 1.3593, + "grad_norm": 0.5004837512969971, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125820416, + "loss": 1.4784, + "grad_norm": 1.7202436923980713, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125866496, + "loss": 1.4052, + "grad_norm": 0.8364105224609375, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1258112, + "loss": 1.3905, + "grad_norm": 0.4931715726852417, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1258496, + "loss": 1.3877, + "grad_norm": 0.9846198558807373, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257728, + "loss": 1.3684, + "grad_norm": 0.7551160454750061, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125808128, + "loss": 1.3911, + "grad_norm": 0.6093849539756775, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125714432, + "loss": 1.3536, + "grad_norm": 0.3243202865123749, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125739008, + "loss": 1.4228, + "grad_norm": 0.7267773747444153, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12571136, + "loss": 1.3794, + "grad_norm": 0.46886834502220154, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125717504, + "loss": 1.3903, + "grad_norm": 0.6609954833984375, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125708288, + "loss": 1.4356, + "grad_norm": 0.8140541911125183, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12564224, + "loss": 1.4072, + "grad_norm": 0.943206787109375, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125805056, + "loss": 1.4116, + "grad_norm": 0.7690998911857605, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125717504, + "loss": 1.4227, + "grad_norm": 1.6718356609344482, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125720576, + "loss": 1.3844, + "grad_norm": 0.3648189604282379, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125775872, + "loss": 1.38, + "grad_norm": 0.7224555015563965, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125679104, + "loss": 1.4188, + "grad_norm": 0.6897174715995789, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125846528, + "loss": 1.3936, + "grad_norm": 0.6976240277290344, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.3716, + "grad_norm": 0.8441869020462036, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125702144, + "loss": 1.3976, + "grad_norm": 1.511319637298584, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.4044, + "grad_norm": 1.2045376300811768, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125679104, + "loss": 1.3866, + "grad_norm": 0.8673524260520935, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.4182, + "grad_norm": 1.0856328010559082, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125739008, + "loss": 1.4075, + "grad_norm": 0.9939833283424377, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125683712, + "loss": 1.3653, + "grad_norm": 0.41317200660705566, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12568064, + "loss": 1.4254, + "grad_norm": 1.0671842098236084, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125782016, + "loss": 1.3615, + "grad_norm": 0.5069601535797119, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125786624, + "loss": 1.4103, + "grad_norm": 0.9690648317337036, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125884928, + "loss": 1.3876, + "grad_norm": 0.5872686505317688, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125728256, + "loss": 1.3761, + "grad_norm": 0.4441320300102234, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125720576, + "loss": 1.3939, + "grad_norm": 0.6702535152435303, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125663744, + "loss": 1.3744, + "grad_norm": 0.25820183753967285, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125696, + "loss": 1.3844, + "grad_norm": 0.4250832796096802, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125723648, + "loss": 1.3973, + "grad_norm": 0.7476893663406372, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125774336, + "loss": 1.3779, + "grad_norm": 0.54308021068573, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125686784, + "loss": 1.41, + "grad_norm": 0.3007655441761017, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125706752, + "loss": 1.4052, + "grad_norm": 0.7555776238441467, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125783552, + "loss": 1.4, + "grad_norm": 0.7371777892112732, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125685248, + "loss": 1.4008, + "grad_norm": 0.3842833936214447, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125786624, + "loss": 1.4189, + "grad_norm": 0.8052723407745361, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125743616, + "loss": 1.4062, + "grad_norm": 0.8772450685501099, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12574208, + "loss": 1.3605, + "grad_norm": 0.28711870312690735, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125725184, + "loss": 1.3914, + "grad_norm": 0.2543298602104187, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12584192, + "loss": 1.4314, + "grad_norm": 0.7195171117782593, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125737472, + "loss": 1.3894, + "grad_norm": 0.5747102499008179, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125817344, + "loss": 1.3975, + "grad_norm": 0.5087365508079529, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12572672, + "loss": 1.3602, + "grad_norm": 0.4455392062664032, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12568832, + "loss": 1.3779, + "grad_norm": 0.675197958946228, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125691392, + "loss": 1.3743, + "grad_norm": 0.5545535683631897, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125758976, + "loss": 1.3669, + "grad_norm": 0.5537741184234619, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125766656, + "loss": 1.3913, + "grad_norm": 0.2820163071155548, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.3997, + "grad_norm": 0.41271352767944336, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125720576, + "loss": 1.3816, + "grad_norm": 0.2506580948829651, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12575744, + "loss": 1.3938, + "grad_norm": 0.5951747298240662, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125700608, + "loss": 1.3937, + "grad_norm": 0.40369901061058044, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125771264, + "loss": 1.3695, + "grad_norm": 0.46146222949028015, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125668352, + "loss": 1.3692, + "grad_norm": 0.274728000164032, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125677568, + "loss": 1.4008, + "grad_norm": 0.7708393335342407, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.3871, + "grad_norm": 0.5276763439178467, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125762048, + "loss": 1.3842, + "grad_norm": 0.545360267162323, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125768192, + "loss": 1.3924, + "grad_norm": 0.39603668451309204, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125686784, + "loss": 1.3822, + "grad_norm": 0.38183584809303284, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125797376, + "loss": 1.3938, + "grad_norm": 0.44493407011032104, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125785088, + "loss": 1.376, + "grad_norm": 0.33913901448249817, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125768192, + "loss": 1.3769, + "grad_norm": 0.8065955638885498, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125706752, + "loss": 1.3875, + "grad_norm": 1.0256986618041992, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125739008, + "loss": 1.3581, + "grad_norm": 0.26662442088127136, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125835776, + "loss": 1.3667, + "grad_norm": 0.25688910484313965, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125808128, + "loss": 1.359, + "grad_norm": 0.3786275386810303, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.4952, + "grad_norm": 1.292995810508728, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125745152, + "loss": 1.3801, + "grad_norm": 0.6289203763008118, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257344, + "loss": 1.4155, + "grad_norm": 0.5588219165802002, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125671424, + "loss": 1.396, + "grad_norm": 0.6319165229797363, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125792768, + "loss": 1.3994, + "grad_norm": 0.29802489280700684, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125714432, + "loss": 1.3755, + "grad_norm": 0.4730647802352905, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12582656, + "loss": 1.3694, + "grad_norm": 0.2684911787509918, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125821952, + "loss": 1.3974, + "grad_norm": 0.7923300266265869, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 2.0644, + "grad_norm": 0.43012166023254395, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151053312, + "loss": 1.3855, + "grad_norm": 0.44297200441360474, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15105024, + "loss": 1.4031, + "grad_norm": 1.0088964700698853, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15093504, + "loss": 1.3673, + "grad_norm": 0.4487733542919159, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150985728, + "loss": 1.3785, + "grad_norm": 0.538973867893219, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150979584, + "loss": 1.401, + "grad_norm": 0.7600394487380981, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1510272, + "loss": 1.3687, + "grad_norm": 0.3526412844657898, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150959616, + "loss": 1.3503, + "grad_norm": 0.33429232239723206, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151005696, + "loss": 1.3747, + "grad_norm": 0.3770933151245117, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15101952, + "loss": 1.3496, + "grad_norm": 0.40849390625953674, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150974976, + "loss": 1.3841, + "grad_norm": 0.73378586769104, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150967296, + "loss": 1.3749, + "grad_norm": 0.497897207736969, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15108864, + "loss": 1.3807, + "grad_norm": 0.7471835017204285, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150985728, + "loss": 1.3571, + "grad_norm": 0.6650179624557495, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 1.3591, + "grad_norm": 0.5958608388900757, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150990336, + "loss": 1.3708, + "grad_norm": 0.4515955150127411, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150916608, + "loss": 1.4262, + "grad_norm": 1.1762816905975342, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150912, + "loss": 1.3722, + "grad_norm": 1.046394944190979, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150962688, + "loss": 1.3539, + "grad_norm": 1.060898780822754, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151091712, + "loss": 1.4197, + "grad_norm": 1.0810054540634155, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150953472, + "loss": 1.4274, + "grad_norm": 1.0718384981155396, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150979584, + "loss": 1.3606, + "grad_norm": 0.6661280393600464, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150991872, + "loss": 1.3666, + "grad_norm": 0.4193108081817627, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150964224, + "loss": 1.4237, + "grad_norm": 1.0968374013900757, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150938112, + "loss": 1.4056, + "grad_norm": 0.35510873794555664, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509504, + "loss": 1.3935, + "grad_norm": 0.6473077535629272, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150978048, + "loss": 1.373, + "grad_norm": 0.6804153919219971, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15099648, + "loss": 1.3858, + "grad_norm": 0.5594496726989746, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151047168, + "loss": 1.3627, + "grad_norm": 0.3291279077529907, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150998016, + "loss": 1.389, + "grad_norm": 0.4444980025291443, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150948864, + "loss": 1.3992, + "grad_norm": 0.8599750995635986, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150993408, + "loss": 1.4083, + "grad_norm": 0.537548840045929, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150892032, + "loss": 1.406, + "grad_norm": 1.0203948020935059, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150959616, + "loss": 1.3983, + "grad_norm": 0.6637153029441833, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509504, + "loss": 1.3991, + "grad_norm": 0.7668421268463135, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150955008, + "loss": 1.3511, + "grad_norm": 0.9297974109649658, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151070208, + "loss": 1.3958, + "grad_norm": 0.7542980909347534, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151030272, + "loss": 1.4334, + "grad_norm": 0.9368133544921875, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509888, + "loss": 1.4105, + "grad_norm": 0.9290534853935242, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 1.3911, + "grad_norm": 0.9261981844902039, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15101184, + "loss": 1.3928, + "grad_norm": 0.5204061269760132, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151031808, + "loss": 1.3781, + "grad_norm": 0.5181812644004822, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150948864, + "loss": 1.4159, + "grad_norm": 1.213171362876892, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151084032, + "loss": 1.3913, + "grad_norm": 0.7074081897735596, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151037952, + "loss": 1.3803, + "grad_norm": 0.6614387631416321, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151059456, + "loss": 1.3794, + "grad_norm": 0.5106993913650513, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150945792, + "loss": 1.345, + "grad_norm": 1.2891278266906738, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150933504, + "loss": 1.4565, + "grad_norm": 1.3498103618621826, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150938112, + "loss": 1.4245, + "grad_norm": 1.1336153745651245, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150974976, + "loss": 1.4159, + "grad_norm": 0.8649649024009705, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150936576, + "loss": 1.343, + "grad_norm": 0.6169582605361938, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150982656, + "loss": 1.3983, + "grad_norm": 0.3996424078941345, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150991872, + "loss": 1.3783, + "grad_norm": 0.2951216995716095, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151030272, + "loss": 1.391, + "grad_norm": 0.7150769233703613, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150948864, + "loss": 1.3777, + "grad_norm": 0.5788134336471558, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15095808, + "loss": 1.3703, + "grad_norm": 0.860302746295929, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151024128, + "loss": 1.3669, + "grad_norm": 0.42600077390670776, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509504, + "loss": 1.3542, + "grad_norm": 0.46812903881073, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150944256, + "loss": 1.3779, + "grad_norm": 0.7300305962562561, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151010304, + "loss": 1.3933, + "grad_norm": 0.6101003885269165, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150968832, + "loss": 1.3614, + "grad_norm": 0.4827808737754822, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151082496, + "loss": 1.3599, + "grad_norm": 0.7216012477874756, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150959616, + "loss": 1.3632, + "grad_norm": 0.6146058440208435, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150985728, + "loss": 1.3771, + "grad_norm": 0.4603070318698883, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150991872, + "loss": 1.3653, + "grad_norm": 0.8702631592750549, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151059456, + "loss": 1.3858, + "grad_norm": 0.6629233360290527, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150915072, + "loss": 1.3996, + "grad_norm": 0.8426764011383057, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151048704, + "loss": 1.368, + "grad_norm": 0.36956822872161865, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151077888, + "loss": 1.3418, + "grad_norm": 0.6008407473564148, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150947328, + "loss": 1.406, + "grad_norm": 0.8672747611999512, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150948864, + "loss": 1.4021, + "grad_norm": 0.6703933477401733, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509504, + "loss": 1.3619, + "grad_norm": 0.4548293948173523, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151024128, + "loss": 1.4433, + "grad_norm": 1.3169457912445068, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151005696, + "loss": 1.3422, + "grad_norm": 0.3597468435764313, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150948864, + "loss": 1.416, + "grad_norm": 0.8113951683044434, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 1.362, + "grad_norm": 0.6494511961936951, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151054848, + "loss": 1.366, + "grad_norm": 0.5387434363365173, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151007232, + "loss": 1.3912, + "grad_norm": 0.600138783454895, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15113472, + "loss": 1.3745, + "grad_norm": 0.6696773767471313, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15101952, + "loss": 1.3672, + "grad_norm": 0.43245619535446167, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150968832, + "loss": 1.3277, + "grad_norm": 0.4105730354785919, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150956544, + "loss": 1.3419, + "grad_norm": 0.7990111112594604, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150948864, + "loss": 1.3937, + "grad_norm": 0.5693293809890747, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151071744, + "loss": 1.3794, + "grad_norm": 0.8156441450119019, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15101184, + "loss": 1.4036, + "grad_norm": 1.071025013923645, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15095808, + "loss": 1.4166, + "grad_norm": 1.1006295680999756, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150974976, + "loss": 1.3888, + "grad_norm": 1.0232137441635132, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151114752, + "loss": 1.3961, + "grad_norm": 0.8334195613861084, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150985728, + "loss": 1.4421, + "grad_norm": 1.4352595806121826, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15098112, + "loss": 1.4324, + "grad_norm": 1.112323522567749, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150884352, + "loss": 1.4061, + "grad_norm": 0.8273434638977051, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150959616, + "loss": 1.3801, + "grad_norm": 0.6828897595405579, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150979584, + "loss": 1.3797, + "grad_norm": 0.3862398862838745, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150945792, + "loss": 1.3677, + "grad_norm": 0.2755945324897766, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150907392, + "loss": 1.3977, + "grad_norm": 0.4865618348121643, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151033344, + "loss": 1.4005, + "grad_norm": 0.5125243067741394, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150910464, + "loss": 1.4161, + "grad_norm": 0.5864467620849609, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150953472, + "loss": 1.4176, + "grad_norm": 0.5309094190597534, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150936576, + "loss": 1.362, + "grad_norm": 0.6920171976089478, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150978048, + "loss": 1.377, + "grad_norm": 0.4273502826690674, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150918144, + "loss": 1.3515, + "grad_norm": 0.35482069849967957, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150941184, + "loss": 1.3762, + "grad_norm": 0.7627120614051819, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150930432, + "loss": 1.3555, + "grad_norm": 0.36984726786613464, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15094272, + "loss": 1.3762, + "grad_norm": 0.388205349445343, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150945792, + "loss": 1.3308, + "grad_norm": 0.8547789454460144, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15092736, + "loss": 1.3709, + "grad_norm": 0.38731029629707336, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150933504, + "loss": 1.3768, + "grad_norm": 0.6487261056900024, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15088128, + "loss": 1.4357, + "grad_norm": 1.3766037225723267, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150953472, + "loss": 1.354, + "grad_norm": 0.5282900333404541, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150994944, + "loss": 1.3477, + "grad_norm": 0.5687118172645569, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15094272, + "loss": 1.3487, + "grad_norm": 0.4359714388847351, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150953472, + "loss": 1.3837, + "grad_norm": 0.5427277684211731, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150936576, + "loss": 1.4105, + "grad_norm": 1.0049668550491333, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150991872, + "loss": 1.3638, + "grad_norm": 0.389760822057724, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15095808, + "loss": 1.3468, + "grad_norm": 0.41414085030555725, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150967296, + "loss": 1.3547, + "grad_norm": 0.5145827531814575, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150968832, + "loss": 1.3422, + "grad_norm": 0.9709764122962952, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15119616, + "loss": 1.3882, + "grad_norm": 0.9082082509994507, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509888, + "loss": 1.3506, + "grad_norm": 0.7806737422943115, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150961152, + "loss": 1.3836, + "grad_norm": 1.0006145238876343, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150925824, + "loss": 1.3587, + "grad_norm": 0.8742190599441528, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151016448, + "loss": 1.3379, + "grad_norm": 1.149460792541504, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150941184, + "loss": 1.3399, + "grad_norm": 0.9264956712722778, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509888, + "loss": 1.3625, + "grad_norm": 0.6866242289543152, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150930432, + "loss": 1.364, + "grad_norm": 0.9175609350204468, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151151616, + "loss": 1.3869, + "grad_norm": 0.9288254976272583, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151033344, + "loss": 1.3533, + "grad_norm": 0.8750184178352356, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150925824, + "loss": 1.3468, + "grad_norm": 1.3015300035476685, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15096576, + "loss": 1.401, + "grad_norm": 0.9193562865257263, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151013376, + "loss": 1.3211, + "grad_norm": 0.7771753072738647, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150979584, + "loss": 1.3915, + "grad_norm": 1.0669987201690674, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15103488, + "loss": 1.3666, + "grad_norm": 0.8891983032226562, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15092736, + "loss": 1.372, + "grad_norm": 0.6592817902565002, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15101184, + "loss": 1.3584, + "grad_norm": 0.9536921977996826, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151001088, + "loss": 1.2941, + "grad_norm": 0.6811585426330566, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150956544, + "loss": 1.3719, + "grad_norm": 0.6541745066642761, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151045632, + "loss": 1.4259, + "grad_norm": 0.6978986859321594, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15097344, + "loss": 1.3554, + "grad_norm": 0.5804476141929626, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150982656, + "loss": 1.3856, + "grad_norm": 0.6380639672279358, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15103488, + "loss": 1.3321, + "grad_norm": 0.5764433741569519, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15103488, + "loss": 1.3129, + "grad_norm": 1.2145497798919678, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151071744, + "loss": 1.3873, + "grad_norm": 0.9029868245124817, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509888, + "loss": 1.3831, + "grad_norm": 0.8384062051773071, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15105792, + "loss": 1.3455, + "grad_norm": 0.6162306666374207, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150999552, + "loss": 1.3548, + "grad_norm": 0.7425881624221802, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151005696, + "loss": 1.3283, + "grad_norm": 0.8823555707931519, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150916608, + "loss": 1.3941, + "grad_norm": 0.9370356798171997, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15103488, + "loss": 1.3739, + "grad_norm": 0.7030972242355347, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 1.3478, + "grad_norm": 0.8014369010925293, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150933504, + "loss": 1.3666, + "grad_norm": 0.9882919788360596, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150982656, + "loss": 1.3355, + "grad_norm": 0.7181122303009033, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151108608, + "loss": 1.3935, + "grad_norm": 0.9494197368621826, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150907392, + "loss": 1.3038, + "grad_norm": 0.6964796781539917, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150971904, + "loss": 1.3691, + "grad_norm": 0.6083819270133972, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 1.3139, + "grad_norm": 0.6104048490524292, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150978048, + "loss": 1.3842, + "grad_norm": 0.758448600769043, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150974976, + "loss": 1.2978, + "grad_norm": 0.8362996578216553, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1509504, + "loss": 1.3994, + "grad_norm": 1.081278920173645, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150933504, + "loss": 1.3579, + "grad_norm": 0.6163837909698486, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15093504, + "loss": 1.3383, + "grad_norm": 0.5938782691955566, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 1.3141, + "grad_norm": 0.7616696357727051, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151053312, + "loss": 1.3871, + "grad_norm": 1.2815037965774536, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151007232, + "loss": 1.2941, + "grad_norm": 0.7637209296226501, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15105792, + "loss": 1.3634, + "grad_norm": 0.6185732483863831, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151028736, + "loss": 1.3623, + "grad_norm": 0.7407740950584412, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150962688, + "loss": 1.3569, + "grad_norm": 1.01309335231781, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15091968, + "loss": 1.3044, + "grad_norm": 0.9012948870658875, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150947328, + "loss": 1.2399, + "grad_norm": 0.8388073444366455, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150930432, + "loss": 1.3656, + "grad_norm": 0.9280896782875061, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15097344, + "loss": 1.2925, + "grad_norm": 0.9964663982391357, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1510272, + "loss": 1.3113, + "grad_norm": 1.2439433336257935, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150924288, + "loss": 1.3325, + "grad_norm": 1.0852175951004028, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150984192, + "loss": 1.2902, + "grad_norm": 1.0865001678466797, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150907392, + "loss": 1.367, + "grad_norm": 1.2326934337615967, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151067136, + "loss": 1.2487, + "grad_norm": 1.2933940887451172, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150921216, + "loss": 1.3269, + "grad_norm": 1.0942070484161377, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150944256, + "loss": 1.381, + "grad_norm": 1.3730473518371582, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150967296, + "loss": 1.3571, + "grad_norm": 1.6463983058929443, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151191552, + "loss": 1.383, + "grad_norm": 1.3831133842468262, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150948864, + "loss": 1.3882, + "grad_norm": 1.278062105178833, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151071744, + "loss": 1.3855, + "grad_norm": 1.3038331270217896, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150987264, + "loss": 1.361, + "grad_norm": 1.3517996072769165, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151076352, + "loss": 1.3731, + "grad_norm": 1.4064159393310547, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150974976, + "loss": 1.3373, + "grad_norm": 1.0790468454360962, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150982656, + "loss": 1.3865, + "grad_norm": 0.9949386119842529, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150905856, + "loss": 1.3202, + "grad_norm": 0.7705826759338379, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150985728, + "loss": 1.3424, + "grad_norm": 0.8659871816635132, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150956544, + "loss": 1.3997, + "grad_norm": 1.1686571836471558, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150944256, + "loss": 1.3365, + "grad_norm": 0.9898850917816162, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151087104, + "loss": 1.2759, + "grad_norm": 0.7875490188598633, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150970368, + "loss": 1.3281, + "grad_norm": 0.6776736378669739, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15098112, + "loss": 1.3607, + "grad_norm": 0.7921596765518188, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 3.4093056, + "gpu_mem": 1.151136256, + "loss": 1.3644, + "grad_norm": 0.8691542744636536, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150945792, + "loss": 1.342, + "grad_norm": 0.7552881836891174, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150967296, + "loss": 1.3189, + "grad_norm": 0.7588068246841431, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 3.4093056, + "gpu_mem": 1.15100416, + "loss": 1.3211, + "grad_norm": 0.6571627855300903, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 3.4093056, + "gpu_mem": 1.150590976, + "loss": 2.0092, + "grad_norm": 1.25569486618042, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125694464, + "loss": 1.2801, + "grad_norm": 1.0021344423294067, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12567296, + "loss": 1.3204, + "grad_norm": 0.890191912651062, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12568832, + "loss": 1.3275, + "grad_norm": 1.0000284910202026, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125835776, + "loss": 1.2791, + "grad_norm": 0.9102291464805603, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125782016, + "loss": 1.3223, + "grad_norm": 1.008239984512329, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125798912, + "loss": 1.3114, + "grad_norm": 1.1620031595230103, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125751296, + "loss": 1.2863, + "grad_norm": 1.1913535594940186, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125775872, + "loss": 1.3206, + "grad_norm": 1.3685158491134644, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125823488, + "loss": 1.3288, + "grad_norm": 1.5498913526535034, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125732864, + "loss": 1.2572, + "grad_norm": 1.39262056350708, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125705216, + "loss": 1.1857, + "grad_norm": 1.360243797302246, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125794304, + "loss": 1.2348, + "grad_norm": 1.2298368215560913, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125685248, + "loss": 1.2124, + "grad_norm": 1.4431509971618652, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125737472, + "loss": 1.1855, + "grad_norm": 1.418897032737732, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125691392, + "loss": 1.3378, + "grad_norm": 2.050952196121216, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125894144, + "loss": 1.2828, + "grad_norm": 1.9025828838348389, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125892608, + "loss": 1.2736, + "grad_norm": 2.2149598598480225, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125815808, + "loss": 1.1848, + "grad_norm": 1.863126277923584, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257728, + "loss": 1.2743, + "grad_norm": 2.309140920639038, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12574976, + "loss": 1.1382, + "grad_norm": 3.1486356258392334, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12570368, + "loss": 1.2544, + "grad_norm": 1.724923014640808, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125820416, + "loss": 1.3639, + "grad_norm": 2.6176211833953857, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125708288, + "loss": 1.3565, + "grad_norm": 1.8052966594696045, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125697536, + "loss": 1.2523, + "grad_norm": 1.8111952543258667, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125702144, + "loss": 1.2482, + "grad_norm": 1.9939043521881104, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125817344, + "loss": 1.3639, + "grad_norm": 2.17223858833313, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125791232, + "loss": 1.2576, + "grad_norm": 1.998698353767395, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125692928, + "loss": 1.2405, + "grad_norm": 1.836585521697998, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125728256, + "loss": 1.2257, + "grad_norm": 1.615218162536621, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125694464, + "loss": 1.2537, + "grad_norm": 1.3650728464126587, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125828096, + "loss": 1.2132, + "grad_norm": 1.355606198310852, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125801984, + "loss": 1.2301, + "grad_norm": 1.5736998319625854, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125846528, + "loss": 1.1564, + "grad_norm": 1.4629967212677002, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125737472, + "loss": 1.2853, + "grad_norm": 1.4798521995544434, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.1717, + "grad_norm": 1.630557894706726, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125829632, + "loss": 1.3521, + "grad_norm": 2.065974473953247, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125677568, + "loss": 1.4015, + "grad_norm": 2.1843855381011963, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12579584, + "loss": 1.2989, + "grad_norm": 2.444472312927246, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125785088, + "loss": 1.2453, + "grad_norm": 2.1483328342437744, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125751296, + "loss": 1.2615, + "grad_norm": 1.536960244178772, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125729792, + "loss": 1.3071, + "grad_norm": 1.5347875356674194, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125725184, + "loss": 1.2408, + "grad_norm": 1.5014082193374634, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125735936, + "loss": 1.2611, + "grad_norm": 1.3105372190475464, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125789696, + "loss": 1.3066, + "grad_norm": 1.5176454782485962, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125706752, + "loss": 1.2324, + "grad_norm": 1.2904958724975586, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257344, + "loss": 1.2646, + "grad_norm": 1.2829535007476807, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125732864, + "loss": 1.2446, + "grad_norm": 1.5704327821731567, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125775872, + "loss": 1.2638, + "grad_norm": 1.4616711139678955, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125774336, + "loss": 1.2884, + "grad_norm": 1.631168246269226, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125825024, + "loss": 1.2624, + "grad_norm": 1.8213300704956055, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125831168, + "loss": 1.2498, + "grad_norm": 1.4336581230163574, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125769728, + "loss": 1.3146, + "grad_norm": 1.4688706398010254, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125758976, + "loss": 1.2752, + "grad_norm": 1.9694615602493286, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125783552, + "loss": 1.2991, + "grad_norm": 1.5247130393981934, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125800448, + "loss": 1.286, + "grad_norm": 1.991368293762207, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125671424, + "loss": 1.2655, + "grad_norm": 1.4695661067962646, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125846528, + "loss": 1.2151, + "grad_norm": 1.8277021646499634, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12580352, + "loss": 1.2105, + "grad_norm": 1.962829351425171, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.2141, + "grad_norm": 1.7224100828170776, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125660672, + "loss": 1.1613, + "grad_norm": 1.4821224212646484, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125676032, + "loss": 1.3514, + "grad_norm": 2.0117788314819336, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125777408, + "loss": 1.2483, + "grad_norm": 1.922282338142395, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125786624, + "loss": 1.2392, + "grad_norm": 2.1163864135742188, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125815808, + "loss": 1.2504, + "grad_norm": 2.1247146129608154, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125877248, + "loss": 1.2854, + "grad_norm": 1.9337564706802368, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125731328, + "loss": 1.1387, + "grad_norm": 1.7797491550445557, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12581888, + "loss": 1.3188, + "grad_norm": 2.5965631008148193, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.2246, + "grad_norm": 1.8144394159317017, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125792768, + "loss": 1.2527, + "grad_norm": 2.2240564823150635, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125828096, + "loss": 1.2741, + "grad_norm": 2.288771390914917, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125766656, + "loss": 1.1592, + "grad_norm": 1.924763560295105, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125723648, + "loss": 1.2264, + "grad_norm": 1.824343204498291, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125725184, + "loss": 1.2162, + "grad_norm": 2.0432207584381104, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125762048, + "loss": 1.1873, + "grad_norm": 1.777555227279663, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257344, + "loss": 1.2426, + "grad_norm": 1.7710180282592773, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125791232, + "loss": 1.2022, + "grad_norm": 1.9939147233963013, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125820416, + "loss": 1.1798, + "grad_norm": 2.209946870803833, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125720576, + "loss": 1.2443, + "grad_norm": 2.0772104263305664, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125743616, + "loss": 1.204, + "grad_norm": 2.0399672985076904, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12571904, + "loss": 1.189, + "grad_norm": 1.956943154335022, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1258112, + "loss": 1.2395, + "grad_norm": 2.0163984298706055, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12568064, + "loss": 1.186, + "grad_norm": 2.1892919540405273, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125817344, + "loss": 1.2627, + "grad_norm": 1.9242080450057983, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125674496, + "loss": 1.1904, + "grad_norm": 1.9996249675750732, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125697536, + "loss": 1.1755, + "grad_norm": 1.99502694606781, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125745152, + "loss": 1.3414, + "grad_norm": 2.2705914974212646, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12578816, + "loss": 1.2762, + "grad_norm": 1.9224432706832886, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125715968, + "loss": 1.2745, + "grad_norm": 2.0973212718963623, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125814272, + "loss": 1.214, + "grad_norm": 2.1936962604522705, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125709824, + "loss": 1.1773, + "grad_norm": 1.9469021558761597, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12586496, + "loss": 1.1674, + "grad_norm": 1.7908746004104614, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125697536, + "loss": 1.2579, + "grad_norm": 1.7354512214660645, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.2145, + "grad_norm": 1.8321118354797363, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125762048, + "loss": 1.1941, + "grad_norm": 1.8774724006652832, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125685248, + "loss": 1.3063, + "grad_norm": 2.208664655685425, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125821952, + "loss": 1.1988, + "grad_norm": 2.0201425552368164, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125709824, + "loss": 1.1645, + "grad_norm": 1.990088939666748, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125706752, + "loss": 1.3152, + "grad_norm": 2.1123080253601074, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12568064, + "loss": 1.2691, + "grad_norm": 1.755974531173706, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125715968, + "loss": 1.1146, + "grad_norm": 1.8903571367263794, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125731328, + "loss": 1.1773, + "grad_norm": 1.9153156280517578, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125725184, + "loss": 1.2268, + "grad_norm": 1.8224170207977295, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125737472, + "loss": 1.3236, + "grad_norm": 2.2900588512420654, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125769728, + "loss": 1.2136, + "grad_norm": 2.256296396255493, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125771264, + "loss": 1.2815, + "grad_norm": 2.001415252685547, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125743616, + "loss": 1.3006, + "grad_norm": 2.4435157775878906, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125792768, + "loss": 1.2037, + "grad_norm": 1.9827852249145508, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.2364, + "grad_norm": 2.3719582557678223, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12578048, + "loss": 1.0991, + "grad_norm": 2.0149455070495605, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125771264, + "loss": 1.0674, + "grad_norm": 1.7100521326065063, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125705216, + "loss": 1.2436, + "grad_norm": 2.270052671432495, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12572672, + "loss": 1.2135, + "grad_norm": 2.040447235107422, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125782016, + "loss": 1.2955, + "grad_norm": 2.709552049636841, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125699072, + "loss": 1.2535, + "grad_norm": 2.0035016536712646, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125763584, + "loss": 1.332, + "grad_norm": 2.553908109664917, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.102, + "grad_norm": 2.0185513496398926, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.2156, + "grad_norm": 2.0152745246887207, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125728256, + "loss": 1.1437, + "grad_norm": 1.8052268028259277, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.2194, + "grad_norm": 1.924847960472107, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125732864, + "loss": 1.3273, + "grad_norm": 2.624382495880127, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125637632, + "loss": 1.2995, + "grad_norm": 2.2763493061065674, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125706752, + "loss": 1.1539, + "grad_norm": 2.062746286392212, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.2793, + "grad_norm": 2.3163840770721436, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125699072, + "loss": 1.2688, + "grad_norm": 2.514540910720825, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12575744, + "loss": 1.3237, + "grad_norm": 2.47774076461792, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125763584, + "loss": 1.1746, + "grad_norm": 2.2081568241119385, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125900288, + "loss": 1.0744, + "grad_norm": 1.8066538572311401, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125709824, + "loss": 1.1616, + "grad_norm": 1.66658353805542, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125774336, + "loss": 1.1926, + "grad_norm": 2.1113390922546387, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125689856, + "loss": 1.2038, + "grad_norm": 2.396136522293091, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125706752, + "loss": 1.1312, + "grad_norm": 2.00799822807312, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125706752, + "loss": 1.2191, + "grad_norm": 2.261678457260132, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125832704, + "loss": 1.2957, + "grad_norm": 2.1472344398498535, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.091, + "grad_norm": 2.247433662414551, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125871104, + "loss": 1.1534, + "grad_norm": 1.8070034980773926, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125696, + "loss": 1.2413, + "grad_norm": 1.847753882408142, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125714432, + "loss": 1.1945, + "grad_norm": 1.9214633703231812, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125891072, + "loss": 1.121, + "grad_norm": 1.988155484199524, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12576512, + "loss": 1.1159, + "grad_norm": 1.881072759628296, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125762048, + "loss": 1.239, + "grad_norm": 2.2775754928588867, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125771264, + "loss": 1.2286, + "grad_norm": 1.9603570699691772, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125646848, + "loss": 1.2378, + "grad_norm": 2.0111358165740967, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125929472, + "loss": 1.2258, + "grad_norm": 2.143218755722046, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125692928, + "loss": 1.1184, + "grad_norm": 1.7328288555145264, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12578816, + "loss": 1.2723, + "grad_norm": 1.870348572731018, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.2575, + "grad_norm": 2.8923819065093994, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.3021, + "grad_norm": 2.549255132675171, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.2818, + "grad_norm": 2.125784397125244, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125791232, + "loss": 1.3226, + "grad_norm": 2.3760392665863037, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125686784, + "loss": 1.2664, + "grad_norm": 2.432565212249756, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125715968, + "loss": 1.164, + "grad_norm": 2.0957860946655273, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125659136, + "loss": 1.2688, + "grad_norm": 1.9980767965316772, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257344, + "loss": 1.1509, + "grad_norm": 2.1849589347839355, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12572672, + "loss": 1.1467, + "grad_norm": 1.9171429872512817, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12575744, + "loss": 1.2582, + "grad_norm": 2.0399720668792725, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125686784, + "loss": 1.2647, + "grad_norm": 1.9902386665344238, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.2128, + "grad_norm": 2.4347081184387207, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125808128, + "loss": 1.246, + "grad_norm": 2.224797487258911, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12570368, + "loss": 1.1834, + "grad_norm": 2.067335844039917, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125712896, + "loss": 1.2772, + "grad_norm": 2.216869354248047, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125740544, + "loss": 1.3052, + "grad_norm": 2.480145215988159, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125754368, + "loss": 1.3451, + "grad_norm": 2.4951000213623047, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125748224, + "loss": 1.1866, + "grad_norm": 1.9465035200119019, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12571136, + "loss": 1.1815, + "grad_norm": 2.0216007232666016, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125769728, + "loss": 1.3071, + "grad_norm": 3.1460025310516357, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125679104, + "loss": 1.278, + "grad_norm": 2.324364185333252, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125763584, + "loss": 1.2046, + "grad_norm": 2.431290626525879, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125801984, + "loss": 1.1343, + "grad_norm": 2.4237658977508545, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125846528, + "loss": 1.3213, + "grad_norm": 2.2055253982543945, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125808128, + "loss": 1.2171, + "grad_norm": 1.911645531654358, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125745152, + "loss": 1.2853, + "grad_norm": 2.4477052688598633, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125798912, + "loss": 1.3527, + "grad_norm": 2.542365074157715, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125737472, + "loss": 1.2186, + "grad_norm": 2.2143774032592773, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125723648, + "loss": 1.1607, + "grad_norm": 2.05959415435791, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125935616, + "loss": 1.1484, + "grad_norm": 2.1109299659729004, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125800448, + "loss": 1.1856, + "grad_norm": 2.53446626663208, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125774336, + "loss": 1.2371, + "grad_norm": 2.242356538772583, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12578048, + "loss": 1.1952, + "grad_norm": 2.0890231132507324, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125814272, + "loss": 1.3597, + "grad_norm": 2.8311550617218018, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.3296, + "grad_norm": 2.2377445697784424, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125806592, + "loss": 1.202, + "grad_norm": 2.393383264541626, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125694464, + "loss": 1.1909, + "grad_norm": 2.0903987884521484, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12575744, + "loss": 1.2392, + "grad_norm": 1.9520516395568848, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.1646, + "grad_norm": 2.137089729309082, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125722112, + "loss": 1.3094, + "grad_norm": 2.1856279373168945, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125806592, + "loss": 1.2827, + "grad_norm": 2.1439290046691895, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125752832, + "loss": 1.2746, + "grad_norm": 2.320873975753784, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257728, + "loss": 1.2487, + "grad_norm": 2.5310592651367188, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125700608, + "loss": 1.2823, + "grad_norm": 2.692242383956909, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 3.4093056, + "gpu_mem": 1.12578816, + "loss": 1.2782, + "grad_norm": 2.0992605686187744, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125755904, + "loss": 1.17, + "grad_norm": 1.9681111574172974, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 3.4093056, + "gpu_mem": 1.1257344, + "loss": 0.9887, + "grad_norm": 1.8877077102661133, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125677568, + "loss": 1.202, + "grad_norm": 2.3306381702423096, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125743616, + "loss": 1.3327, + "grad_norm": 2.2331714630126953, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125785088, + "loss": 1.2287, + "grad_norm": 2.0882811546325684, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.4093056, + "gpu_mem": 1.125785088, + "train_runtime": 8533.2534, + "train_samples_per_second": 4.418, + "train_steps_per_second": 0.069, + "total_flos": 8.875874857128346e+16, + "train_loss": 1.3690297749577736 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f43ee5d95e6efa86bc12e96d56fbf5a2c265b7 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 4, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 2, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc2f59f3dad59a371f46dd1db575a7703e7145a --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5122336227308603 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..f5cd05d77447c8a25bc2f10d570cc99552a22e9e --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 2, + "alpha": 4, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 1576960 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-winogrande-r2-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-winogrande-r2-a2", + "seed": 42, + "timestamp": "2025-08-31T22:45:06.771079" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..fa0dd344ce5221203a91010cbe621ef0f9bf7ae0 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r2-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 3.736838144, + "gpu_mem": 1.055875584, + "loss": 3.2646, + "grad_norm": 15.341161727905273, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 3.737427968, + "gpu_mem": 1.068489216, + "loss": 3.2643, + "grad_norm": 15.007658004760742, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 3.737821184, + "gpu_mem": 1.068493824, + "loss": 3.1731, + "grad_norm": 14.545931816101074, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 3.7382144, + "gpu_mem": 1.068492288, + "loss": 3.0562, + "grad_norm": 14.222400665283203, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 3.738411008, + "gpu_mem": 1.068492288, + "loss": 3.0695, + "grad_norm": 14.911974906921387, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 3.738607616, + "gpu_mem": 1.068498432, + "loss": 3.0764, + "grad_norm": 15.628325462341309, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 3.739000832, + "gpu_mem": 1.068504576, + "loss": 2.8607, + "grad_norm": 16.12621307373047, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 3.73919744, + "gpu_mem": 1.06848768, + "loss": 2.7586, + "grad_norm": 14.539695739746094, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 3.73919744, + "gpu_mem": 1.068493824, + "loss": 2.7389, + "grad_norm": 15.622178077697754, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 3.739394048, + "gpu_mem": 1.068496896, + "loss": 2.5325, + "grad_norm": 16.108373641967773, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 3.739590656, + "gpu_mem": 1.068486144, + "loss": 2.3359, + "grad_norm": 16.209314346313477, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 3.739787264, + "gpu_mem": 1.068490752, + "loss": 2.1598, + "grad_norm": 16.10175132751465, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 3.739983872, + "gpu_mem": 1.068498432, + "loss": 1.846, + "grad_norm": 14.556991577148438, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 3.74018048, + "gpu_mem": 1.068493824, + "loss": 1.6227, + "grad_norm": 12.672173500061035, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 3.74018048, + "gpu_mem": 1.068493824, + "loss": 1.3616, + "grad_norm": 10.309033393859863, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 3.740377088, + "gpu_mem": 1.068490752, + "loss": 1.1472, + "grad_norm": 6.798020839691162, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 3.740377088, + "gpu_mem": 1.068490752, + "loss": 0.9276, + "grad_norm": 6.009860038757324, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 3.740377088, + "gpu_mem": 1.068493824, + "loss": 0.9682, + "grad_norm": 4.698451519012451, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 3.740573696, + "gpu_mem": 1.068490752, + "loss": 0.8742, + "grad_norm": 4.804157733917236, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 3.740573696, + "gpu_mem": 1.068498432, + "loss": 0.8761, + "grad_norm": 8.626604080200195, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 3.740770304, + "gpu_mem": 1.068490752, + "loss": 0.7794, + "grad_norm": 3.852346420288086, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 3.740770304, + "gpu_mem": 1.068490752, + "loss": 0.7643, + "grad_norm": 7.624159812927246, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 3.740770304, + "gpu_mem": 1.068486144, + "loss": 0.7315, + "grad_norm": 3.28399395942688, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 3.740966912, + "gpu_mem": 1.068489216, + "loss": 0.734, + "grad_norm": 5.7163801193237305, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 3.740966912, + "gpu_mem": 1.068492288, + "loss": 0.7081, + "grad_norm": 2.2371132373809814, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 3.740966912, + "gpu_mem": 1.06848768, + "loss": 0.6633, + "grad_norm": 1.4205535650253296, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 3.740966912, + "gpu_mem": 1.068486144, + "loss": 0.8982, + "grad_norm": 11.487899780273438, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 3.740966912, + "gpu_mem": 1.068492288, + "loss": 0.7394, + "grad_norm": 5.137890815734863, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 3.740966912, + "gpu_mem": 1.068490752, + "loss": 0.7639, + "grad_norm": 5.181459903717041, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 3.740966912, + "gpu_mem": 1.068490752, + "loss": 0.7421, + "grad_norm": 4.702651500701904, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 3.740966912, + "gpu_mem": 1.068490752, + "loss": 0.7046, + "grad_norm": 2.956028938293457, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 3.740966912, + "gpu_mem": 1.06848768, + "loss": 0.6953, + "grad_norm": 3.145109176635742, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 3.740966912, + "gpu_mem": 1.06848768, + "loss": 0.8653, + "grad_norm": 7.55678129196167, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7653, + "grad_norm": 4.607822895050049, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6868, + "grad_norm": 1.1636474132537842, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7262, + "grad_norm": 3.2954277992248535, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7032, + "grad_norm": 1.193465232849121, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7365, + "grad_norm": 2.7860684394836426, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7214, + "grad_norm": 2.9561874866485596, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7293, + "grad_norm": 1.0235817432403564, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7173, + "grad_norm": 0.574097752571106, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7198, + "grad_norm": 2.648030996322632, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7013, + "grad_norm": 0.7263724207878113, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7039, + "grad_norm": 1.6622720956802368, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.694, + "grad_norm": 1.003551721572876, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7099, + "grad_norm": 1.2824467420578003, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7378, + "grad_norm": 2.166978120803833, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7188, + "grad_norm": 2.5151896476745605, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6732, + "grad_norm": 1.6929657459259033, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7198, + "grad_norm": 1.0859344005584717, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6975, + "grad_norm": 0.9548556804656982, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7088, + "grad_norm": 1.1440411806106567, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7086, + "grad_norm": 1.7345162630081177, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068481536, + "loss": 0.749, + "grad_norm": 2.3650598526000977, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.708, + "grad_norm": 1.6000055074691772, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7031, + "grad_norm": 0.8860859870910645, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6912, + "grad_norm": 0.8413110971450806, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.712, + "grad_norm": 1.5282130241394043, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.7329, + "grad_norm": 2.479212760925293, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7157, + "grad_norm": 1.1457135677337646, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6923, + "grad_norm": 0.39912694692611694, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6932, + "grad_norm": 0.29515859484672546, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6912, + "grad_norm": 0.4923630654811859, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7221, + "grad_norm": 2.509011745452881, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7019, + "grad_norm": 1.0072270631790161, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7152, + "grad_norm": 2.4033114910125732, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6873, + "grad_norm": 1.1867454051971436, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7066, + "grad_norm": 0.8923547863960266, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6838, + "grad_norm": 0.6291250586509705, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6982, + "grad_norm": 0.8638536930084229, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7078, + "grad_norm": 2.032560110092163, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7167, + "grad_norm": 1.2106930017471313, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6816, + "grad_norm": 1.001874327659607, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7365, + "grad_norm": 2.653109073638916, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6843, + "grad_norm": 1.0181810855865479, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7812, + "grad_norm": 3.916050672531128, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7037, + "grad_norm": 0.9829432368278503, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6931, + "grad_norm": 0.4783083498477936, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7167, + "grad_norm": 1.6793144941329956, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7376, + "grad_norm": 2.0821690559387207, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.7189, + "grad_norm": 1.5532649755477905, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6905, + "grad_norm": 0.3926987648010254, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6872, + "grad_norm": 0.71334308385849, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7221, + "grad_norm": 1.8339756727218628, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7411, + "grad_norm": 1.958024024963379, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7085, + "grad_norm": 1.1607639789581299, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.684, + "grad_norm": 0.33241385221481323, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.698, + "grad_norm": 0.6219249367713928, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7099, + "grad_norm": 0.8807662725448608, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.686, + "grad_norm": 0.2522576153278351, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6681, + "grad_norm": 0.6229816675186157, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7479, + "grad_norm": 1.4716682434082031, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7193, + "grad_norm": 0.8628938794136047, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7043, + "grad_norm": 0.3660317659378052, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6974, + "grad_norm": 0.5728554725646973, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6944, + "grad_norm": 0.6389541625976562, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.708, + "grad_norm": 0.6137107610702515, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7101, + "grad_norm": 0.5691790580749512, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6866, + "grad_norm": 0.7134369611740112, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7065, + "grad_norm": 0.5455506443977356, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7021, + "grad_norm": 0.3545895516872406, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7058, + "grad_norm": 0.5642436742782593, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6916, + "grad_norm": 0.4994181990623474, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6984, + "grad_norm": 0.27669987082481384, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6948, + "grad_norm": 0.34594061970710754, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.705, + "grad_norm": 0.6067831516265869, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6955, + "grad_norm": 0.32750412821769714, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.691, + "grad_norm": 0.5042902231216431, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7129, + "grad_norm": 0.32008469104766846, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6929, + "grad_norm": 0.4685095250606537, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7136, + "grad_norm": 0.49369707703590393, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6926, + "grad_norm": 0.37866172194480896, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068483072, + "loss": 0.6827, + "grad_norm": 0.4609016180038452, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6971, + "grad_norm": 0.17680197954177856, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.697, + "grad_norm": 0.39778974652290344, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.723, + "grad_norm": 1.6693419218063354, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.694, + "grad_norm": 0.2889886498451233, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6921, + "grad_norm": 0.17514359951019287, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7215, + "grad_norm": 1.3596268892288208, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6981, + "grad_norm": 0.3159288465976715, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6961, + "grad_norm": 0.29614701867103577, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6963, + "grad_norm": 1.0608338117599487, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6967, + "grad_norm": 0.12757009267807007, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6951, + "grad_norm": 0.4640333950519562, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068483072, + "loss": 0.7002, + "grad_norm": 0.34109261631965637, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6997, + "grad_norm": 0.5762715935707092, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6906, + "grad_norm": 0.1668487936258316, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7051, + "grad_norm": 0.9032878279685974, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7004, + "grad_norm": 0.22612537443637848, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6986, + "grad_norm": 0.25610682368278503, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6984, + "grad_norm": 0.5078030228614807, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6995, + "grad_norm": 0.5803123712539673, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6876, + "grad_norm": 1.1624385118484497, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7026, + "grad_norm": 0.6950159668922424, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6766, + "grad_norm": 0.24848176538944244, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6797, + "grad_norm": 0.16260002553462982, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6959, + "grad_norm": 0.5595306754112244, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7181, + "grad_norm": 0.9321412444114685, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7063, + "grad_norm": 0.6775842905044556, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6958, + "grad_norm": 0.4582332670688629, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6968, + "grad_norm": 0.7271935343742371, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6949, + "grad_norm": 0.5809777975082397, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7017, + "grad_norm": 0.25248032808303833, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.701, + "grad_norm": 0.544955313205719, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6833, + "grad_norm": 0.21749435365200043, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6839, + "grad_norm": 0.2535029649734497, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7279, + "grad_norm": 1.2177573442459106, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6882, + "grad_norm": 0.2631727457046509, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6958, + "grad_norm": 0.6597511172294617, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7007, + "grad_norm": 0.6605477333068848, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6919, + "grad_norm": 0.32438457012176514, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6913, + "grad_norm": 0.16103753447532654, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.69, + "grad_norm": 0.3038259446620941, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6942, + "grad_norm": 0.2840510606765747, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7035, + "grad_norm": 0.7380078434944153, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6973, + "grad_norm": 0.2670704424381256, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6904, + "grad_norm": 1.5326921939849854, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068501504, + "loss": 0.7017, + "grad_norm": 0.4048844575881958, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6906, + "grad_norm": 0.18344715237617493, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7168, + "grad_norm": 0.678510308265686, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7136, + "grad_norm": 0.6432071328163147, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7135, + "grad_norm": 0.5835223197937012, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7084, + "grad_norm": 0.8523590564727783, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6907, + "grad_norm": 0.49763649702072144, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6819, + "grad_norm": 0.21055033802986145, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7119, + "grad_norm": 0.6058294177055359, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7358, + "grad_norm": 0.817696750164032, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7345, + "grad_norm": 0.7444019913673401, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7293, + "grad_norm": 0.7038159966468811, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7331, + "grad_norm": 0.8314561247825623, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6874, + "grad_norm": 0.5283418893814087, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7026, + "grad_norm": 0.2935195863246918, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068483072, + "loss": 0.7023, + "grad_norm": 0.3836955428123474, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7183, + "grad_norm": 0.7241147756576538, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7237, + "grad_norm": 0.9878159761428833, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6993, + "grad_norm": 0.19689683616161346, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7032, + "grad_norm": 0.2405109405517578, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6893, + "grad_norm": 0.17460864782333374, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6922, + "grad_norm": 0.12229316681623459, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6898, + "grad_norm": 0.21134480834007263, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068506112, + "loss": 0.6713, + "grad_norm": 0.539641261100769, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7286, + "grad_norm": 1.045167326927185, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7163, + "grad_norm": 0.8501600623130798, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6905, + "grad_norm": 0.51210618019104, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.686, + "grad_norm": 0.22745631635189056, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7349, + "grad_norm": 1.1106964349746704, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6824, + "grad_norm": 0.2674286365509033, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.706, + "grad_norm": 0.7124238610267639, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7169, + "grad_norm": 0.8434913754463196, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6855, + "grad_norm": 0.33296748995780945, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6951, + "grad_norm": 0.4905150234699249, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6819, + "grad_norm": 0.7793123722076416, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6836, + "grad_norm": 0.16045551002025604, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6841, + "grad_norm": 0.27169084548950195, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7695, + "grad_norm": 2.231945037841797, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7139, + "grad_norm": 0.9966713190078735, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7114, + "grad_norm": 0.48636358976364136, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6927, + "grad_norm": 0.19105762243270874, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6914, + "grad_norm": 0.29180189967155457, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7044, + "grad_norm": 0.6314902305603027, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7048, + "grad_norm": 0.5078075528144836, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6871, + "grad_norm": 0.49282175302505493, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7151, + "grad_norm": 1.28816819190979, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6983, + "grad_norm": 0.2227339744567871, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6841, + "grad_norm": 0.22007830440998077, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7191, + "grad_norm": 0.8341906666755676, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7044, + "grad_norm": 0.46939682960510254, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6876, + "grad_norm": 0.30110588669776917, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7174, + "grad_norm": 0.8721743822097778, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.697, + "grad_norm": 0.48787611722946167, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7109, + "grad_norm": 0.9787148833274841, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6962, + "grad_norm": 0.37622204422950745, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6866, + "grad_norm": 0.1223369762301445, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6914, + "grad_norm": 0.41015371680259705, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6833, + "grad_norm": 0.3515612781047821, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6929, + "grad_norm": 0.5895916223526001, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7064, + "grad_norm": 0.36029744148254395, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7033, + "grad_norm": 0.7294370532035828, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6915, + "grad_norm": 0.21985113620758057, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6962, + "grad_norm": 0.12996530532836914, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6945, + "grad_norm": 0.7904802560806274, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.691, + "grad_norm": 0.15054088830947876, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6853, + "grad_norm": 0.34210070967674255, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6938, + "grad_norm": 0.17128601670265198, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.696, + "grad_norm": 0.5231716632843018, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7009, + "grad_norm": 0.39934539794921875, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6961, + "grad_norm": 0.1790948361158371, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6947, + "grad_norm": 0.1673346310853958, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6908, + "grad_norm": 0.21347711980342865, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6962, + "grad_norm": 0.3430756628513336, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6845, + "grad_norm": 0.42399147152900696, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6918, + "grad_norm": 0.20001542568206787, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7019, + "grad_norm": 0.33074578642845154, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068501504, + "loss": 0.7002, + "grad_norm": 0.3362917900085449, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6937, + "grad_norm": 0.49595949053764343, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.697, + "grad_norm": 0.5481435060501099, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.7051, + "grad_norm": 0.6215497255325317, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6984, + "grad_norm": 0.9785731434822083, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7038, + "grad_norm": 0.4780134856700897, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7027, + "grad_norm": 0.3041737377643585, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6706, + "grad_norm": 0.28354543447494507, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.68, + "grad_norm": 0.11263764649629593, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.7452, + "grad_norm": 1.0595217943191528, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7182, + "grad_norm": 0.6199190616607666, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7087, + "grad_norm": 0.5876704454421997, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6958, + "grad_norm": 0.14200207591056824, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6964, + "grad_norm": 0.12572866678237915, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7078, + "grad_norm": 0.5993900299072266, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6847, + "grad_norm": 0.10746530443429947, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7206, + "grad_norm": 0.7313397526741028, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6953, + "grad_norm": 0.14610086381435394, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6869, + "grad_norm": 0.2163408249616623, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6951, + "grad_norm": 0.13957883417606354, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6975, + "grad_norm": 0.17123976349830627, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6928, + "grad_norm": 0.17455562949180603, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6982, + "grad_norm": 0.24220675230026245, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6977, + "grad_norm": 0.11216392368078232, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6944, + "grad_norm": 0.30877357721328735, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7027, + "grad_norm": 0.5149140357971191, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6963, + "grad_norm": 0.28218215703964233, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.695, + "grad_norm": 0.578066885471344, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6906, + "grad_norm": 0.12588295340538025, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068501504, + "loss": 0.6945, + "grad_norm": 0.11946961283683777, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6939, + "grad_norm": 0.100344717502594, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6993, + "grad_norm": 0.24917533993721008, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6933, + "grad_norm": 0.19170351326465607, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6919, + "grad_norm": 0.19857192039489746, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7039, + "grad_norm": 0.5036092400550842, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6912, + "grad_norm": 0.2684602737426758, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7021, + "grad_norm": 0.3963507115840912, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6947, + "grad_norm": 0.11346612870693207, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6925, + "grad_norm": 0.49780726432800293, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.693, + "grad_norm": 0.25823283195495605, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6916, + "grad_norm": 0.4184877276420593, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6966, + "grad_norm": 0.3523774743080139, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6929, + "grad_norm": 0.25271540880203247, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.696, + "grad_norm": 0.4417850375175476, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6879, + "grad_norm": 0.12403226643800735, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6957, + "grad_norm": 0.41995370388031006, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6969, + "grad_norm": 0.1567663997411728, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6883, + "grad_norm": 0.35025253891944885, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7039, + "grad_norm": 0.3068544566631317, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7177, + "grad_norm": 0.946091890335083, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7037, + "grad_norm": 0.3491276502609253, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6889, + "grad_norm": 0.17164400219917297, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7035, + "grad_norm": 0.8080204725265503, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6968, + "grad_norm": 0.2539673149585724, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6967, + "grad_norm": 0.1454460471868515, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6992, + "grad_norm": 0.3375777006149292, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6913, + "grad_norm": 0.22563420236110687, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6899, + "grad_norm": 0.13412392139434814, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068481536, + "loss": 0.6955, + "grad_norm": 0.13308067619800568, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7125, + "grad_norm": 0.6364147067070007, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068483072, + "loss": 0.7161, + "grad_norm": 0.825778067111969, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6902, + "grad_norm": 0.14107663929462433, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6933, + "grad_norm": 0.37827205657958984, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6965, + "grad_norm": 0.11653515696525574, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.695, + "grad_norm": 0.17433851957321167, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6931, + "grad_norm": 0.16421417891979218, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7071, + "grad_norm": 0.6435844302177429, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6931, + "grad_norm": 0.11096253991127014, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.692, + "grad_norm": 0.13169337809085846, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068501504, + "loss": 0.6978, + "grad_norm": 0.23815208673477173, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6937, + "grad_norm": 0.2776617407798767, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6937, + "grad_norm": 0.2852586805820465, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6925, + "grad_norm": 0.11637840420007706, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7043, + "grad_norm": 0.44181421399116516, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6885, + "grad_norm": 0.1297931969165802, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7121, + "grad_norm": 0.6209468841552734, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6837, + "grad_norm": 0.24388304352760315, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6896, + "grad_norm": 0.15176953375339508, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.7014, + "grad_norm": 0.2991977035999298, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.69, + "grad_norm": 0.11519603431224823, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6908, + "grad_norm": 0.14178843796253204, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6901, + "grad_norm": 0.16125711798667908, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6978, + "grad_norm": 0.45740842819213867, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6978, + "grad_norm": 0.3697444200515747, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6934, + "grad_norm": 0.22798733413219452, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6862, + "grad_norm": 0.3072078824043274, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6854, + "grad_norm": 0.3590487539768219, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6619, + "grad_norm": 0.5785498023033142, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6947, + "grad_norm": 0.2823600471019745, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.709, + "grad_norm": 0.4644613265991211, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068501504, + "loss": 0.7415, + "grad_norm": 0.8632988333702087, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7194, + "grad_norm": 0.5871100425720215, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.7167, + "grad_norm": 0.49985271692276, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6856, + "grad_norm": 0.12939928472042084, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7054, + "grad_norm": 0.3780755400657654, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7102, + "grad_norm": 0.5026342868804932, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6918, + "grad_norm": 0.15686984360218048, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6979, + "grad_norm": 0.10953454673290253, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6898, + "grad_norm": 0.3794475197792053, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6863, + "grad_norm": 0.2173452377319336, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6942, + "grad_norm": 0.2524566054344177, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6974, + "grad_norm": 0.3086255192756653, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6755, + "grad_norm": 0.15213735401630402, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7234, + "grad_norm": 0.7455875873565674, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7321, + "grad_norm": 0.7993876338005066, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7042, + "grad_norm": 0.48965004086494446, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7066, + "grad_norm": 0.33446022868156433, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6922, + "grad_norm": 0.1785680204629898, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6864, + "grad_norm": 0.3093636929988861, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6929, + "grad_norm": 0.15023477375507355, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6994, + "grad_norm": 0.5150320529937744, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6978, + "grad_norm": 0.4413391649723053, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6987, + "grad_norm": 0.15408672392368317, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6945, + "grad_norm": 0.3325234055519104, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6979, + "grad_norm": 0.30588504672050476, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848, + "loss": 0.6979, + "grad_norm": 0.22226110100746155, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6897, + "grad_norm": 0.15655352175235748, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6902, + "grad_norm": 0.3011403977870941, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6936, + "grad_norm": 0.14197301864624023, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6955, + "grad_norm": 0.22666539251804352, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6888, + "grad_norm": 1.5097085237503052, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.693, + "grad_norm": 0.4130704998970032, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7029, + "grad_norm": 0.47340983152389526, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7012, + "grad_norm": 0.5587112903594971, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.688, + "grad_norm": 0.3821542263031006, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6949, + "grad_norm": 0.1493266075849533, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6936, + "grad_norm": 0.3656148314476013, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6963, + "grad_norm": 0.15976186096668243, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6891, + "grad_norm": 0.5558398365974426, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6885, + "grad_norm": 0.13083486258983612, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7009, + "grad_norm": 0.5258653163909912, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7068, + "grad_norm": 0.6849786043167114, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7016, + "grad_norm": 0.34010910987854004, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.681, + "grad_norm": 0.3265831172466278, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6832, + "grad_norm": 0.27797889709472656, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6854, + "grad_norm": 0.25142616033554077, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6927, + "grad_norm": 0.21203815937042236, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6994, + "grad_norm": 0.3062473237514496, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7056, + "grad_norm": 0.34951573610305786, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6931, + "grad_norm": 0.20027469098567963, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06850304, + "loss": 0.681, + "grad_norm": 0.3639501631259918, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.697, + "grad_norm": 0.35288935899734497, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6972, + "grad_norm": 0.3930516839027405, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06850304, + "loss": 0.6996, + "grad_norm": 0.276464581489563, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6899, + "grad_norm": 0.3198925852775574, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7038, + "grad_norm": 0.4031619429588318, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6908, + "grad_norm": 0.3554222583770752, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7052, + "grad_norm": 0.31471705436706543, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6822, + "grad_norm": 0.2020546942949295, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6975, + "grad_norm": 0.1600024551153183, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7149, + "grad_norm": 0.536888599395752, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7217, + "grad_norm": 0.7459531426429749, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.7016, + "grad_norm": 0.3504086434841156, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6844, + "grad_norm": 0.15432633459568024, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6969, + "grad_norm": 0.3092220425605774, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6936, + "grad_norm": 0.12380414456129074, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6955, + "grad_norm": 0.36055055260658264, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6914, + "grad_norm": 0.11457808315753937, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6899, + "grad_norm": 0.11186027526855469, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.694, + "grad_norm": 0.18799778819084167, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6969, + "grad_norm": 0.1980976015329361, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6735, + "grad_norm": 0.5779987573623657, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6936, + "grad_norm": 0.10455330461263657, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6908, + "grad_norm": 0.13172529637813568, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6988, + "grad_norm": 0.17030271887779236, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7144, + "grad_norm": 0.5178220868110657, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7161, + "grad_norm": 0.5244734883308411, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.677, + "grad_norm": 0.2601771950721741, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6958, + "grad_norm": 0.22160431742668152, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.692, + "grad_norm": 0.12964078783988953, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7035, + "grad_norm": 0.45724692940711975, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6894, + "grad_norm": 0.12018575519323349, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6866, + "grad_norm": 0.12046580016613007, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6937, + "grad_norm": 0.0877939760684967, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6911, + "grad_norm": 0.12210951745510101, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6913, + "grad_norm": 0.09073065221309662, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6891, + "grad_norm": 0.2891448140144348, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.7002, + "grad_norm": 0.3898809254169464, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6883, + "grad_norm": 0.18583045899868011, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6884, + "grad_norm": 0.19345863163471222, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6952, + "grad_norm": 0.10333971679210663, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6881, + "grad_norm": 0.17495012283325195, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6977, + "grad_norm": 0.38061025738716125, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6978, + "grad_norm": 0.3702091574668884, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6925, + "grad_norm": 0.2658718228340149, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6935, + "grad_norm": 0.4891740679740906, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6911, + "grad_norm": 0.1067863404750824, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7026, + "grad_norm": 0.3713737726211548, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6939, + "grad_norm": 0.5389323234558105, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6867, + "grad_norm": 0.5793989300727844, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7002, + "grad_norm": 0.22989045083522797, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6962, + "grad_norm": 0.24180863797664642, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6873, + "grad_norm": 0.3239053189754486, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6958, + "grad_norm": 0.11407417058944702, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.7072, + "grad_norm": 0.6501893401145935, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6956, + "grad_norm": 0.13312356173992157, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6892, + "grad_norm": 0.11326117068529129, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6838, + "grad_norm": 0.4245454967021942, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6955, + "grad_norm": 0.10632988065481186, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6945, + "grad_norm": 0.18738490343093872, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6988, + "grad_norm": 0.21100488305091858, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6934, + "grad_norm": 0.18668249249458313, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6913, + "grad_norm": 0.10376464575529099, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7007, + "grad_norm": 0.1832624226808548, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6961, + "grad_norm": 0.09405610710382462, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6889, + "grad_norm": 0.10564573854207993, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6906, + "grad_norm": 0.7760626077651978, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6941, + "grad_norm": 0.08665087819099426, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6911, + "grad_norm": 0.17989614605903625, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6925, + "grad_norm": 0.19903291761875153, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6876, + "grad_norm": 0.24834184348583221, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6882, + "grad_norm": 0.13772642612457275, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.684, + "grad_norm": 0.2967734634876251, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6876, + "grad_norm": 0.18726655840873718, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6875, + "grad_norm": 0.14846105873584747, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6846, + "grad_norm": 0.1429746448993683, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7074, + "grad_norm": 0.5055184364318848, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6944, + "grad_norm": 0.2552069425582886, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6659, + "grad_norm": 0.5078445672988892, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6736, + "grad_norm": 0.3192375600337982, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6967, + "grad_norm": 0.19982264935970306, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7164, + "grad_norm": 0.7453495264053345, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7135, + "grad_norm": 0.6334215998649597, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.7053, + "grad_norm": 0.48558351397514343, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.717, + "grad_norm": 0.7891340851783752, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7016, + "grad_norm": 0.2745852768421173, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7036, + "grad_norm": 0.4409470856189728, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.684, + "grad_norm": 0.7008882761001587, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6979, + "grad_norm": 0.6351841688156128, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6991, + "grad_norm": 0.16718043386936188, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6911, + "grad_norm": 0.18258486688137054, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6904, + "grad_norm": 0.3989619016647339, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6958, + "grad_norm": 0.11860117316246033, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6882, + "grad_norm": 0.13927942514419556, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6939, + "grad_norm": 0.14481347799301147, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7034, + "grad_norm": 0.48914477229118347, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6989, + "grad_norm": 0.2823730707168579, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6824, + "grad_norm": 0.34819912910461426, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7028, + "grad_norm": 0.34358957409858704, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6955, + "grad_norm": 0.17758041620254517, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6993, + "grad_norm": 0.2608531415462494, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6991, + "grad_norm": 0.1538439393043518, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6993, + "grad_norm": 0.15718139708042145, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.69, + "grad_norm": 0.1282118856906891, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6913, + "grad_norm": 0.14349167048931122, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6854, + "grad_norm": 0.1797506958246231, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6879, + "grad_norm": 0.15538226068019867, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6853, + "grad_norm": 0.17159630358219147, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6927, + "grad_norm": 0.31000980734825134, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7034, + "grad_norm": 0.37567541003227234, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6949, + "grad_norm": 0.07869354635477066, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7009, + "grad_norm": 0.3402394950389862, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6861, + "grad_norm": 0.22806316614151, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6972, + "grad_norm": 0.203037291765213, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6949, + "grad_norm": 0.4455038011074066, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6911, + "grad_norm": 0.39700791239738464, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.694, + "grad_norm": 0.24324260652065277, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6921, + "grad_norm": 0.361435204744339, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6896, + "grad_norm": 0.17220118641853333, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6954, + "grad_norm": 0.49588543176651, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6898, + "grad_norm": 0.16981586813926697, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6893, + "grad_norm": 0.550917387008667, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6939, + "grad_norm": 0.2121947705745697, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6921, + "grad_norm": 0.38103121519088745, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6955, + "grad_norm": 0.5080791115760803, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6936, + "grad_norm": 0.12950006127357483, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6984, + "grad_norm": 0.1999814212322235, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6951, + "grad_norm": 0.12143615633249283, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6927, + "grad_norm": 0.2619875371456146, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6913, + "grad_norm": 0.32101908326148987, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6958, + "grad_norm": 0.08150527626276016, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6961, + "grad_norm": 0.3815608620643616, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6924, + "grad_norm": 0.0926518440246582, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6934, + "grad_norm": 0.12169565260410309, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6887, + "grad_norm": 0.3043464124202728, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6917, + "grad_norm": 0.3119855225086212, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.696, + "grad_norm": 0.17051991820335388, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6923, + "grad_norm": 0.10309315472841263, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6905, + "grad_norm": 0.09666906297206879, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6902, + "grad_norm": 0.19074124097824097, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6911, + "grad_norm": 0.3246023952960968, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6955, + "grad_norm": 0.3217044472694397, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.694, + "grad_norm": 0.3072715699672699, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6889, + "grad_norm": 0.6054893732070923, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6885, + "grad_norm": 0.7390820980072021, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6946, + "grad_norm": 0.08565685898065567, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6908, + "grad_norm": 0.298713356256485, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6915, + "grad_norm": 0.10765811055898666, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6876, + "grad_norm": 0.25140565633773804, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.694, + "grad_norm": 0.11185373365879059, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6891, + "grad_norm": 0.12432953715324402, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.7013, + "grad_norm": 0.4505141079425812, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6916, + "grad_norm": 0.14192578196525574, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6932, + "grad_norm": 0.1425580531358719, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6897, + "grad_norm": 0.14806632697582245, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6979, + "grad_norm": 0.4461994469165802, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6835, + "grad_norm": 0.3337884843349457, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068504576, + "loss": 0.6997, + "grad_norm": 0.5258088707923889, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7004, + "grad_norm": 0.49278610944747925, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6924, + "grad_norm": 0.22338207066059113, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6961, + "grad_norm": 0.1735931932926178, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.7027, + "grad_norm": 0.1679200530052185, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6938, + "grad_norm": 0.21122527122497559, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6876, + "grad_norm": 0.22090433537960052, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6902, + "grad_norm": 0.2584611177444458, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6937, + "grad_norm": 0.12387854605913162, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6908, + "grad_norm": 0.4550953209400177, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.689, + "grad_norm": 0.10006877034902573, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.693, + "grad_norm": 0.2492443323135376, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6967, + "grad_norm": 0.21769756078720093, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.685, + "grad_norm": 0.10121368616819382, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.7, + "grad_norm": 0.11487221717834473, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.691, + "grad_norm": 0.08381395041942596, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6969, + "grad_norm": 0.267179399728775, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6933, + "grad_norm": 0.14143946766853333, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6933, + "grad_norm": 0.15395282208919525, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068501504, + "loss": 0.6905, + "grad_norm": 0.18735258281230927, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.695, + "grad_norm": 0.25961852073669434, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6947, + "grad_norm": 0.13014058768749237, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6942, + "grad_norm": 0.2051059901714325, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6858, + "grad_norm": 0.3782062232494354, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6894, + "grad_norm": 0.7799779772758484, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6883, + "grad_norm": 0.11601296812295914, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6974, + "grad_norm": 0.3643569350242615, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6995, + "grad_norm": 0.2757208049297333, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6938, + "grad_norm": 0.2277052402496338, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6935, + "grad_norm": 0.23295487463474274, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.695, + "grad_norm": 0.11207695305347443, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.7028, + "grad_norm": 0.5064395070075989, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6949, + "grad_norm": 0.1738373190164566, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6903, + "grad_norm": 0.16464684903621674, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6909, + "grad_norm": 0.10186775028705597, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.702, + "grad_norm": 0.5676169395446777, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6994, + "grad_norm": 0.36957213282585144, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6911, + "grad_norm": 0.2440928965806961, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.7009, + "grad_norm": 0.4179360270500183, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6906, + "grad_norm": 0.1440572589635849, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6966, + "grad_norm": 0.1569071263074875, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.685, + "grad_norm": 0.3702599108219147, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.697, + "grad_norm": 0.2180940955877304, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6899, + "grad_norm": 0.23850862681865692, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6924, + "grad_norm": 0.08840714395046234, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.7008, + "grad_norm": 0.29720503091812134, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.6933, + "grad_norm": 0.10849012434482574, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.6943, + "grad_norm": 0.1271669566631317, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6946, + "grad_norm": 0.10272325575351715, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6869, + "grad_norm": 0.44231104850769043, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6893, + "grad_norm": 0.19077900052070618, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6902, + "grad_norm": 0.11466526985168457, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6938, + "grad_norm": 0.094680055975914, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6889, + "grad_norm": 0.10096382349729538, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.691, + "grad_norm": 0.19353455305099487, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.701, + "grad_norm": 0.4765409827232361, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068499968, + "loss": 0.691, + "grad_norm": 0.1250181645154953, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6882, + "grad_norm": 0.3695790767669678, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6918, + "grad_norm": 0.28121551871299744, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6985, + "grad_norm": 0.4061795175075531, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6964, + "grad_norm": 0.6136791706085205, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6967, + "grad_norm": 0.336139440536499, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6926, + "grad_norm": 0.25990375876426697, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.7, + "grad_norm": 0.602540910243988, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068498432, + "loss": 0.6896, + "grad_norm": 0.1136828139424324, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6911, + "grad_norm": 0.3104899227619171, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6927, + "grad_norm": 0.14092765748500824, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6964, + "grad_norm": 0.33869045972824097, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6929, + "grad_norm": 0.19212546944618225, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6945, + "grad_norm": 0.193926140666008, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6911, + "grad_norm": 0.12735842168331146, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.692, + "grad_norm": 0.18898391723632812, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068496896, + "loss": 0.6936, + "grad_norm": 0.32877117395401, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.696, + "grad_norm": 0.14287161827087402, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068486144, + "loss": 0.693, + "grad_norm": 0.5137943625450134, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6939, + "grad_norm": 0.2515864074230194, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6912, + "grad_norm": 0.1985500156879425, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6903, + "grad_norm": 0.28095823526382446, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6936, + "grad_norm": 0.43502533435821533, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6995, + "grad_norm": 0.14164747297763824, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6929, + "grad_norm": 0.2673846185207367, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6941, + "grad_norm": 0.20323961973190308, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6969, + "grad_norm": 0.11928880214691162, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6909, + "grad_norm": 0.2696690261363983, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6928, + "grad_norm": 0.27168479561805725, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6883, + "grad_norm": 0.0903002992272377, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6914, + "grad_norm": 0.2481173872947693, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6912, + "grad_norm": 0.10357651859521866, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6913, + "grad_norm": 0.093354731798172, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.691, + "grad_norm": 0.3148961663246155, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6973, + "grad_norm": 0.45378485321998596, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.699, + "grad_norm": 0.1960097700357437, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6915, + "grad_norm": 0.35037487745285034, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6928, + "grad_norm": 0.1940658837556839, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6913, + "grad_norm": 0.14327414333820343, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.694, + "grad_norm": 0.094209223985672, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6896, + "grad_norm": 0.19197936356067657, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.69, + "grad_norm": 0.17874263226985931, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068489216, + "loss": 0.6923, + "grad_norm": 0.26839059591293335, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06848768, + "loss": 0.6921, + "grad_norm": 0.30177614092826843, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068501504, + "loss": 0.6884, + "grad_norm": 0.25411301851272583, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6898, + "grad_norm": 0.14581865072250366, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.6897, + "grad_norm": 0.09368970990180969, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068492288, + "loss": 0.6948, + "grad_norm": 0.29514747858047485, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6932, + "grad_norm": 0.1015295460820198, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068484608, + "loss": 0.6916, + "grad_norm": 0.12912791967391968, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 3.74116352, + "gpu_mem": 1.06849536, + "loss": 0.6963, + "grad_norm": 0.505466103553772, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6939, + "grad_norm": 0.2741418480873108, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068493824, + "loss": 0.6901, + "grad_norm": 0.26659685373306274, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "loss": 0.69, + "grad_norm": 0.2104029804468155, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 3.74116352, + "gpu_mem": 1.068490752, + "train_runtime": 1451.7235, + "train_samples_per_second": 28.198, + "train_steps_per_second": 0.441, + "total_flos": 1.4579617164754944e+16, + "train_loss": 0.7455016063526273 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..97cff55d3f03a364161498b7b6299c246238daf5 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d19a82f1141392d099f4ad415c22848a716242 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5477505919494869 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..5062e4ea443c70fbffbe5d3473465e616b0a93ca --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 25231360 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-winogrande-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-winogrande-r32-a2", + "seed": 42, + "timestamp": "2025-09-01T13:09:45.834334" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..65683df23f40393d5080511e3b1ed69c921315a9 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r32-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 3.270025216, + "gpu_mem": 1.150493184, + "loss": 3.2646, + "grad_norm": 63.73407745361328, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 3.270418432, + "gpu_mem": 1.352342016, + "loss": 3.2643, + "grad_norm": 62.4156379699707, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 3.271008256, + "gpu_mem": 1.352346624, + "loss": 2.926, + "grad_norm": 59.52467727661133, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 3.271401472, + "gpu_mem": 1.352345088, + "loss": 2.3595, + "grad_norm": 55.23207473754883, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 3.271794688, + "gpu_mem": 1.352345088, + "loss": 1.7731, + "grad_norm": 46.444522857666016, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 3.271991296, + "gpu_mem": 1.352351232, + "loss": 1.2917, + "grad_norm": 27.436742782592773, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 3.272187904, + "gpu_mem": 1.352357376, + "loss": 0.9802, + "grad_norm": 14.524227142333984, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 3.272384512, + "gpu_mem": 1.35234048, + "loss": 0.8345, + "grad_norm": 9.032588005065918, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 3.27258112, + "gpu_mem": 1.352346624, + "loss": 0.773, + "grad_norm": 11.368887901306152, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 3.272777728, + "gpu_mem": 1.352349696, + "loss": 0.9551, + "grad_norm": 27.99814796447754, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 3.272974336, + "gpu_mem": 1.352338944, + "loss": 0.9049, + "grad_norm": 21.739261627197266, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 3.273170944, + "gpu_mem": 1.352343552, + "loss": 0.6955, + "grad_norm": 3.2317769527435303, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 3.273170944, + "gpu_mem": 1.352351232, + "loss": 0.7001, + "grad_norm": 2.7765791416168213, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 3.273367552, + "gpu_mem": 1.352346624, + "loss": 0.8012, + "grad_norm": 12.265933990478516, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 3.27356416, + "gpu_mem": 1.352346624, + "loss": 0.6954, + "grad_norm": 3.497291326522827, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 3.27356416, + "gpu_mem": 1.352343552, + "loss": 0.6994, + "grad_norm": 4.9063849449157715, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 3.273760768, + "gpu_mem": 1.352343552, + "loss": 0.745, + "grad_norm": 6.929089546203613, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 3.273760768, + "gpu_mem": 1.352346624, + "loss": 0.7149, + "grad_norm": 2.5208630561828613, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 3.273760768, + "gpu_mem": 1.352343552, + "loss": 0.7072, + "grad_norm": 1.5155459642410278, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 3.273957376, + "gpu_mem": 1.352351232, + "loss": 0.6939, + "grad_norm": 1.67841374874115, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 3.273957376, + "gpu_mem": 1.352343552, + "loss": 0.7881, + "grad_norm": 6.701051712036133, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 3.273957376, + "gpu_mem": 1.352343552, + "loss": 0.765, + "grad_norm": 6.371286392211914, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352338944, + "loss": 0.7587, + "grad_norm": 10.70005989074707, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352342016, + "loss": 0.6969, + "grad_norm": 1.8845895528793335, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352345088, + "loss": 0.7118, + "grad_norm": 2.346724271774292, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 3.274153984, + "gpu_mem": 1.35234048, + "loss": 0.7328, + "grad_norm": 5.648448944091797, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352338944, + "loss": 0.7124, + "grad_norm": 4.639279365539551, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352345088, + "loss": 0.7074, + "grad_norm": 3.016674518585205, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352343552, + "loss": 0.7069, + "grad_norm": 1.549887776374817, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352343552, + "loss": 0.6967, + "grad_norm": 2.5075032711029053, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 3.274153984, + "gpu_mem": 1.352343552, + "loss": 0.7212, + "grad_norm": 2.7181057929992676, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6883, + "grad_norm": 0.9924866557121277, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7101, + "grad_norm": 2.208127021789551, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7482, + "grad_norm": 4.076840400695801, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7572, + "grad_norm": 3.853905200958252, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7155, + "grad_norm": 2.3666326999664307, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7358, + "grad_norm": 2.9161219596862793, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7001, + "grad_norm": 0.7008652091026306, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.761, + "grad_norm": 4.000246047973633, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7184, + "grad_norm": 1.5836018323898315, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7249, + "grad_norm": 1.9657446146011353, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7345, + "grad_norm": 3.4741196632385254, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6951, + "grad_norm": 1.5992825031280518, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7232, + "grad_norm": 2.6476211547851562, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6906, + "grad_norm": 0.9567970037460327, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.806, + "grad_norm": 4.252427577972412, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.8195, + "grad_norm": 4.374600410461426, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7004, + "grad_norm": 1.7798327207565308, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.9731, + "grad_norm": 7.499812126159668, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.839, + "grad_norm": 3.9224393367767334, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.8063, + "grad_norm": 3.210278272628784, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7242, + "grad_norm": 1.2137354612350464, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7136, + "grad_norm": 2.0668463706970215, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352334336, + "loss": 0.721, + "grad_norm": 2.2894632816314697, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6947, + "grad_norm": 1.0013632774353027, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6985, + "grad_norm": 0.3791711926460266, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.7045, + "grad_norm": 1.9636479616165161, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.722, + "grad_norm": 2.538165807723999, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.7094, + "grad_norm": 2.0806198120117188, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7709, + "grad_norm": 3.6979589462280273, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7378, + "grad_norm": 2.772806406021118, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6969, + "grad_norm": 0.8926016688346863, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7008, + "grad_norm": 1.8618338108062744, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.739, + "grad_norm": 2.900374174118042, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6991, + "grad_norm": 2.10282039642334, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7569, + "grad_norm": 3.1595253944396973, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6848, + "grad_norm": 0.2748308777809143, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6978, + "grad_norm": 0.2644709050655365, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7079, + "grad_norm": 1.2326061725616455, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6917, + "grad_norm": 0.3750247657299042, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6958, + "grad_norm": 0.24399234354496002, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6954, + "grad_norm": 0.3637118637561798, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6988, + "grad_norm": 1.0973938703536987, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7088, + "grad_norm": 0.9531999230384827, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.675, + "grad_norm": 0.5387400388717651, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7442, + "grad_norm": 1.9291459321975708, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6963, + "grad_norm": 0.12687836587429047, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6988, + "grad_norm": 0.48563769459724426, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7015, + "grad_norm": 0.572743833065033, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.696, + "grad_norm": 0.235610693693161, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6983, + "grad_norm": 0.45529428124427795, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7095, + "grad_norm": 0.8527933955192566, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.694, + "grad_norm": 0.6862139701843262, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6957, + "grad_norm": 0.348800927400589, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6928, + "grad_norm": 0.09802792221307755, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.701, + "grad_norm": 0.6302459836006165, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6931, + "grad_norm": 0.40821778774261475, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7171, + "grad_norm": 1.0373386144638062, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6905, + "grad_norm": 0.10351550579071045, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6967, + "grad_norm": 0.6066154837608337, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6708, + "grad_norm": 0.5592688322067261, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.8093, + "grad_norm": 2.1380412578582764, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7668, + "grad_norm": 1.5945909023284912, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.703, + "grad_norm": 0.599414050579071, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7, + "grad_norm": 0.9185135960578918, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6938, + "grad_norm": 0.5364395976066589, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7119, + "grad_norm": 1.0012913942337036, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7008, + "grad_norm": 0.33099642395973206, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7401, + "grad_norm": 2.66505765914917, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6991, + "grad_norm": 0.47305557131767273, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6949, + "grad_norm": 0.4785209894180298, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.7062, + "grad_norm": 0.984514594078064, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.72, + "grad_norm": 1.632286787033081, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6951, + "grad_norm": 0.5210705399513245, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.695, + "grad_norm": 0.27211979031562805, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6941, + "grad_norm": 0.15020450949668884, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7012, + "grad_norm": 0.631259024143219, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7048, + "grad_norm": 0.7896405458450317, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7108, + "grad_norm": 0.7187256813049316, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7309, + "grad_norm": 1.612958550453186, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.697, + "grad_norm": 0.11598368734121323, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6947, + "grad_norm": 0.2679135799407959, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352335872, + "loss": 0.6763, + "grad_norm": 0.18590646982192993, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6959, + "grad_norm": 0.43049412965774536, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.705, + "grad_norm": 0.603291928768158, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7119, + "grad_norm": 1.3865022659301758, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7327, + "grad_norm": 1.4207278490066528, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.723, + "grad_norm": 1.1547808647155762, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.738, + "grad_norm": 1.3826148509979248, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.705, + "grad_norm": 0.4601854085922241, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.73, + "grad_norm": 1.0949137210845947, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6917, + "grad_norm": 0.8030508756637573, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.8013, + "grad_norm": 2.2999284267425537, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.845, + "grad_norm": 2.811913251876831, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352335872, + "loss": 0.7381, + "grad_norm": 1.3482791185379028, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6957, + "grad_norm": 0.11973300576210022, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7258, + "grad_norm": 1.0839433670043945, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.745, + "grad_norm": 1.417629599571228, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6971, + "grad_norm": 0.17127159237861633, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7146, + "grad_norm": 0.8499923348426819, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7138, + "grad_norm": 0.9009805917739868, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7075, + "grad_norm": 0.7757030725479126, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.716, + "grad_norm": 1.380367398262024, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7226, + "grad_norm": 1.0860220193862915, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6728, + "grad_norm": 0.10880745947360992, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6809, + "grad_norm": 0.1163453534245491, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6883, + "grad_norm": 0.24833980202674866, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6972, + "grad_norm": 0.2804546654224396, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6933, + "grad_norm": 0.07717876881361008, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7181, + "grad_norm": 1.2340861558914185, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7034, + "grad_norm": 0.9161601662635803, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6914, + "grad_norm": 0.34330111742019653, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7441, + "grad_norm": 1.4091439247131348, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7356, + "grad_norm": 1.1729670763015747, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6866, + "grad_norm": 0.2830427587032318, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6967, + "grad_norm": 0.5672622323036194, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6964, + "grad_norm": 0.3954281508922577, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6977, + "grad_norm": 0.26878443360328674, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6941, + "grad_norm": 0.5253047347068787, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6839, + "grad_norm": 0.1599457859992981, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7665, + "grad_norm": 2.0819993019104004, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6998, + "grad_norm": 0.5312199592590332, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7067, + "grad_norm": 0.7498101592063904, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6966, + "grad_norm": 0.37377211451530457, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6864, + "grad_norm": 0.2595387399196625, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.7024, + "grad_norm": 0.5623244047164917, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6948, + "grad_norm": 1.7964485883712769, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352354304, + "loss": 0.6959, + "grad_norm": 0.2239331305027008, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6876, + "grad_norm": 0.20790685713291168, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7, + "grad_norm": 0.4246101975440979, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6975, + "grad_norm": 0.33159127831459045, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6941, + "grad_norm": 0.17968569695949554, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6793, + "grad_norm": 0.4299807846546173, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6745, + "grad_norm": 0.20216795802116394, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6886, + "grad_norm": 0.6413382887840271, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7405, + "grad_norm": 1.253934621810913, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7088, + "grad_norm": 0.6245874762535095, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6947, + "grad_norm": 0.15330156683921814, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7012, + "grad_norm": 0.3976576030254364, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6873, + "grad_norm": 0.13830946385860443, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7389, + "grad_norm": 1.1554720401763916, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6975, + "grad_norm": 0.2242167741060257, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352335872, + "loss": 0.7113, + "grad_norm": 0.8983135223388672, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.702, + "grad_norm": 0.5645713806152344, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6922, + "grad_norm": 0.38416069746017456, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.697, + "grad_norm": 0.26240837574005127, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6973, + "grad_norm": 0.2065459042787552, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7008, + "grad_norm": 0.5262129306793213, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6996, + "grad_norm": 0.5007551908493042, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6944, + "grad_norm": 0.16840845346450806, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352358912, + "loss": 0.6788, + "grad_norm": 0.8117008209228516, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7748, + "grad_norm": 2.5114011764526367, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7839, + "grad_norm": 2.6167209148406982, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7496, + "grad_norm": 1.9006192684173584, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6978, + "grad_norm": 0.5582042932510376, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7066, + "grad_norm": 0.5042638182640076, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6767, + "grad_norm": 0.07772033661603928, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7016, + "grad_norm": 0.4092998504638672, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7146, + "grad_norm": 0.8027478456497192, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6909, + "grad_norm": 0.6803792119026184, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.706, + "grad_norm": 1.057088017463684, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6847, + "grad_norm": 0.972978413105011, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6931, + "grad_norm": 0.322747141122818, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.691, + "grad_norm": 0.39225634932518005, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7414, + "grad_norm": 2.4710655212402344, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6942, + "grad_norm": 0.5192441344261169, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6974, + "grad_norm": 0.43489590287208557, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7017, + "grad_norm": 0.5858332514762878, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6827, + "grad_norm": 0.22319354116916656, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.723, + "grad_norm": 1.0692894458770752, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6991, + "grad_norm": 0.501838743686676, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7149, + "grad_norm": 1.4547818899154663, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6682, + "grad_norm": 0.8860859274864197, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7146, + "grad_norm": 1.1095187664031982, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7117, + "grad_norm": 1.114591121673584, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.713, + "grad_norm": 0.9130935668945312, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6939, + "grad_norm": 0.14576587080955505, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7113, + "grad_norm": 0.8140343427658081, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6898, + "grad_norm": 0.17273324728012085, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6922, + "grad_norm": 0.0784626230597496, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7202, + "grad_norm": 0.8068010807037354, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6954, + "grad_norm": 0.1635095626115799, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6961, + "grad_norm": 0.2893528938293457, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6915, + "grad_norm": 0.08015276491641998, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7128, + "grad_norm": 0.7268898487091064, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7153, + "grad_norm": 0.7744960188865662, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.698, + "grad_norm": 0.15026028454303741, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7073, + "grad_norm": 0.59479159116745, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6918, + "grad_norm": 0.05160045623779297, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6964, + "grad_norm": 0.15494582056999207, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6827, + "grad_norm": 0.41117727756500244, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6988, + "grad_norm": 0.2589212656021118, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6876, + "grad_norm": 0.1615212857723236, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6947, + "grad_norm": 0.13706658780574799, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7021, + "grad_norm": 0.48667848110198975, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6972, + "grad_norm": 0.3070719242095947, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.696, + "grad_norm": 0.13087020814418793, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7014, + "grad_norm": 0.4563586413860321, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7015, + "grad_norm": 0.5223767757415771, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7005, + "grad_norm": 0.4904763698577881, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6918, + "grad_norm": 0.44867679476737976, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6915, + "grad_norm": 0.19542352855205536, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6933, + "grad_norm": 0.04600891098380089, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352354304, + "loss": 0.6936, + "grad_norm": 0.10508575290441513, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6937, + "grad_norm": 0.2867787778377533, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.694, + "grad_norm": 0.5671350955963135, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.7003, + "grad_norm": 0.49143028259277344, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.7009, + "grad_norm": 0.8550102114677429, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7021, + "grad_norm": 0.35339057445526123, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6997, + "grad_norm": 0.21798287332057953, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6751, + "grad_norm": 0.24287700653076172, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.681, + "grad_norm": 0.057356953620910645, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.7426, + "grad_norm": 0.8423979878425598, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7112, + "grad_norm": 0.4570711851119995, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6995, + "grad_norm": 0.39405742287635803, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6985, + "grad_norm": 0.3444359600543976, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.694, + "grad_norm": 0.18827815353870392, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7244, + "grad_norm": 1.0636255741119385, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6855, + "grad_norm": 0.041457850486040115, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7156, + "grad_norm": 0.8373501896858215, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6925, + "grad_norm": 0.1963270604610443, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6975, + "grad_norm": 0.5628525018692017, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6959, + "grad_norm": 0.15777277946472168, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.695, + "grad_norm": 0.04754224419593811, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6913, + "grad_norm": 0.2699243128299713, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6995, + "grad_norm": 0.3779385983943939, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6941, + "grad_norm": 0.12050600349903107, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7026, + "grad_norm": 0.6506355404853821, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6937, + "grad_norm": 0.6805200576782227, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6919, + "grad_norm": 0.3026537597179413, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.706, + "grad_norm": 1.1423968076705933, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6931, + "grad_norm": 0.05105169862508774, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352354304, + "loss": 0.6952, + "grad_norm": 0.184014230966568, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6937, + "grad_norm": 0.16503316164016724, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7026, + "grad_norm": 0.5530457496643066, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6896, + "grad_norm": 0.1812533587217331, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6916, + "grad_norm": 0.23015443980693817, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7, + "grad_norm": 0.6884751319885254, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6943, + "grad_norm": 0.4781191349029541, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6922, + "grad_norm": 0.36291030049324036, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6931, + "grad_norm": 0.16358566284179688, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7057, + "grad_norm": 0.8412946462631226, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6905, + "grad_norm": 0.2422756850719452, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6909, + "grad_norm": 0.4419383108615875, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7015, + "grad_norm": 0.5250729918479919, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.704, + "grad_norm": 0.4332287013530731, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6881, + "grad_norm": 0.3057299554347992, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6944, + "grad_norm": 0.19857926666736603, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6996, + "grad_norm": 0.4467513859272003, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6941, + "grad_norm": 0.10934343934059143, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6872, + "grad_norm": 0.21460023522377014, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7066, + "grad_norm": 0.3915785253047943, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7424, + "grad_norm": 0.9662010073661804, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.704, + "grad_norm": 0.336186021566391, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6938, + "grad_norm": 0.1000228151679039, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7104, + "grad_norm": 0.6986575722694397, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.7005, + "grad_norm": 0.2519655227661133, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6958, + "grad_norm": 0.15090671181678772, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.695, + "grad_norm": 0.3053504228591919, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6904, + "grad_norm": 0.1252623349428177, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6953, + "grad_norm": 0.23911452293395996, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352334336, + "loss": 0.7002, + "grad_norm": 0.38573217391967773, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7305, + "grad_norm": 0.9929963946342468, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352335872, + "loss": 0.7193, + "grad_norm": 0.8850874304771423, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6941, + "grad_norm": 0.06360094249248505, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7115, + "grad_norm": 0.4887949526309967, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7, + "grad_norm": 0.20785105228424072, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7019, + "grad_norm": 0.27733251452445984, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6914, + "grad_norm": 0.15654239058494568, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6971, + "grad_norm": 0.714505136013031, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7053, + "grad_norm": 0.634199857711792, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.714, + "grad_norm": 0.8933894038200378, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352354304, + "loss": 0.6941, + "grad_norm": 0.19121013581752777, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6883, + "grad_norm": 0.13519123196601868, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6909, + "grad_norm": 0.33823707699775696, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6924, + "grad_norm": 0.11599043011665344, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.701, + "grad_norm": 0.650134801864624, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6946, + "grad_norm": 0.33209845423698425, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6913, + "grad_norm": 0.6237694025039673, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7053, + "grad_norm": 0.7133299112319946, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.7017, + "grad_norm": 0.5022284984588623, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6929, + "grad_norm": 0.15943026542663574, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6922, + "grad_norm": 0.2929353713989258, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.695, + "grad_norm": 0.21617786586284637, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6897, + "grad_norm": 0.042839836329221725, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7182, + "grad_norm": 1.0577114820480347, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.702, + "grad_norm": 0.6705797910690308, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6921, + "grad_norm": 0.19153627753257751, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6819, + "grad_norm": 0.09637956321239471, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6715, + "grad_norm": 0.04713713750243187, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6377, + "grad_norm": 0.2001551389694214, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7271, + "grad_norm": 0.6198419332504272, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7412, + "grad_norm": 0.7371752262115479, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352354304, + "loss": 0.7642, + "grad_norm": 0.9926972389221191, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7108, + "grad_norm": 0.561001181602478, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.696, + "grad_norm": 0.21953046321868896, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7059, + "grad_norm": 0.914376437664032, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6981, + "grad_norm": 0.5195743441581726, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6876, + "grad_norm": 0.15769077837467194, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7071, + "grad_norm": 0.8363847136497498, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7061, + "grad_norm": 0.7036767601966858, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.691, + "grad_norm": 0.7434396147727966, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6963, + "grad_norm": 0.6973944902420044, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6944, + "grad_norm": 0.0699223130941391, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6948, + "grad_norm": 0.10515100508928299, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6845, + "grad_norm": 0.479001522064209, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7119, + "grad_norm": 0.8940211534500122, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7122, + "grad_norm": 0.939653754234314, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6946, + "grad_norm": 0.2811334729194641, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6961, + "grad_norm": 0.20503851771354675, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.713, + "grad_norm": 0.961098849773407, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7077, + "grad_norm": 0.8160921931266785, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6934, + "grad_norm": 0.06153903156518936, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6878, + "grad_norm": 0.2907518446445465, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.685, + "grad_norm": 0.05061933025717735, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7106, + "grad_norm": 0.42087066173553467, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7295, + "grad_norm": 0.6391321420669556, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7221, + "grad_norm": 0.5444504618644714, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.3523328, + "loss": 0.6922, + "grad_norm": 0.07998336106538773, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6901, + "grad_norm": 0.06688784807920456, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6856, + "grad_norm": 0.14002864062786102, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7034, + "grad_norm": 0.35526224970817566, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.695, + "grad_norm": 0.19997824728488922, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6308, + "grad_norm": 0.9206176400184631, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7413, + "grad_norm": 0.9374169707298279, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7299, + "grad_norm": 0.8007163405418396, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7163, + "grad_norm": 0.6180440783500671, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6934, + "grad_norm": 0.26234170794487, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6975, + "grad_norm": 0.13576945662498474, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6885, + "grad_norm": 0.05884963274002075, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7088, + "grad_norm": 0.5611029863357544, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6741, + "grad_norm": 0.31234028935432434, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6935, + "grad_norm": 0.2849453389644623, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7136, + "grad_norm": 0.8932207822799683, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7047, + "grad_norm": 0.8668509721755981, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6948, + "grad_norm": 0.1296972781419754, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7375, + "grad_norm": 1.6148147583007812, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7367, + "grad_norm": 1.4988172054290771, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7171, + "grad_norm": 1.0587363243103027, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6929, + "grad_norm": 0.06711717694997787, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6991, + "grad_norm": 0.3152831792831421, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7097, + "grad_norm": 0.48199462890625, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7019, + "grad_norm": 0.3229775130748749, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35235584, + "loss": 0.6784, + "grad_norm": 0.16099011898040771, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7108, + "grad_norm": 0.45997947454452515, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7067, + "grad_norm": 0.44827020168304443, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35235584, + "loss": 0.6955, + "grad_norm": 0.24606803059577942, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6879, + "grad_norm": 0.16817867755889893, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7162, + "grad_norm": 0.5832684636116028, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.677, + "grad_norm": 0.10288379341363907, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7188, + "grad_norm": 0.5815040469169617, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6817, + "grad_norm": 0.06431329250335693, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6889, + "grad_norm": 0.10012605041265488, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7115, + "grad_norm": 0.5092347860336304, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7118, + "grad_norm": 0.6294969320297241, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6943, + "grad_norm": 0.14178267121315002, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7006, + "grad_norm": 0.44986408948898315, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6924, + "grad_norm": 0.0598842017352581, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7003, + "grad_norm": 0.3876159191131592, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6876, + "grad_norm": 0.06612315773963928, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6964, + "grad_norm": 0.2518918514251709, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.7006, + "grad_norm": 0.3235779404640198, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6893, + "grad_norm": 0.046971939504146576, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6992, + "grad_norm": 0.3088509738445282, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6774, + "grad_norm": 0.7367755174636841, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6924, + "grad_norm": 0.04641527682542801, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6919, + "grad_norm": 0.03991466015577316, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6932, + "grad_norm": 0.05119848996400833, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7021, + "grad_norm": 0.4675575792789459, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7007, + "grad_norm": 0.4248453378677368, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6856, + "grad_norm": 0.5627554059028625, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6926, + "grad_norm": 0.05353507027029991, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6907, + "grad_norm": 0.13665935397148132, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6949, + "grad_norm": 0.3334975838661194, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6925, + "grad_norm": 0.2518015503883362, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6919, + "grad_norm": 0.25070706009864807, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6937, + "grad_norm": 0.13203802704811096, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6915, + "grad_norm": 0.09975284337997437, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6913, + "grad_norm": 0.06211432069540024, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6857, + "grad_norm": 0.2561088800430298, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7061, + "grad_norm": 0.5514450073242188, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6887, + "grad_norm": 0.05640260502696037, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6865, + "grad_norm": 0.06372450292110443, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6961, + "grad_norm": 0.2164548933506012, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6948, + "grad_norm": 0.2877260446548462, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7037, + "grad_norm": 0.4891502559185028, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.696, + "grad_norm": 0.4184412956237793, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6953, + "grad_norm": 0.30615904927253723, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6866, + "grad_norm": 0.419046014547348, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6973, + "grad_norm": 0.25855883955955505, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7082, + "grad_norm": 0.6514796614646912, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7172, + "grad_norm": 0.841404139995575, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6737, + "grad_norm": 0.4717291295528412, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7007, + "grad_norm": 0.3156585097312927, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6987, + "grad_norm": 0.2603183090686798, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6865, + "grad_norm": 0.2737148106098175, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6925, + "grad_norm": 0.05192165449261665, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.698, + "grad_norm": 0.47019270062446594, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6912, + "grad_norm": 0.05502508580684662, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6886, + "grad_norm": 0.1574275642633438, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6953, + "grad_norm": 0.41993892192840576, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6918, + "grad_norm": 0.08328646421432495, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6907, + "grad_norm": 0.06835395097732544, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6935, + "grad_norm": 0.060632605105638504, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6944, + "grad_norm": 0.21012665331363678, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6926, + "grad_norm": 0.0977601632475853, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6966, + "grad_norm": 0.08853652328252792, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6909, + "grad_norm": 0.07817822694778442, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6915, + "grad_norm": 0.0855395495891571, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6834, + "grad_norm": 0.5175971388816833, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6898, + "grad_norm": 0.07819784432649612, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6862, + "grad_norm": 0.08807367831468582, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6936, + "grad_norm": 0.22537831962108612, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6833, + "grad_norm": 0.10298366099596024, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6839, + "grad_norm": 0.0992562547326088, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6791, + "grad_norm": 0.11199034750461578, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6888, + "grad_norm": 0.1342579424381256, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6841, + "grad_norm": 0.07570768147706985, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6827, + "grad_norm": 0.11123040318489075, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7004, + "grad_norm": 0.42737382650375366, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6857, + "grad_norm": 0.2513103187084198, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6628, + "grad_norm": 0.31552913784980774, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6745, + "grad_norm": 0.22437934577465057, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6844, + "grad_norm": 0.17069807648658752, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.707, + "grad_norm": 0.42335760593414307, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6999, + "grad_norm": 0.36898475885391235, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7019, + "grad_norm": 0.24590528011322021, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.697, + "grad_norm": 0.40048301219940186, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6957, + "grad_norm": 0.16248609125614166, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6896, + "grad_norm": 0.20503361523151398, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7386, + "grad_norm": 0.9776124358177185, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6701, + "grad_norm": 0.3218100368976593, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7027, + "grad_norm": 0.23634834587574005, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7063, + "grad_norm": 0.40373656153678894, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6798, + "grad_norm": 0.23233628273010254, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.697, + "grad_norm": 0.2303260713815689, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6855, + "grad_norm": 0.1406712383031845, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6907, + "grad_norm": 0.15576517581939697, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6929, + "grad_norm": 0.36941686272621155, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6839, + "grad_norm": 0.18228726089000702, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6975, + "grad_norm": 0.41298383474349976, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6892, + "grad_norm": 0.20105738937854767, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6886, + "grad_norm": 0.12906034290790558, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6956, + "grad_norm": 0.14986974000930786, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6944, + "grad_norm": 0.15232160687446594, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6958, + "grad_norm": 0.14093594253063202, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7031, + "grad_norm": 0.2815418541431427, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6869, + "grad_norm": 0.30493640899658203, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6971, + "grad_norm": 0.26601457595825195, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6936, + "grad_norm": 0.2676689326763153, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6838, + "grad_norm": 0.23511265218257904, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6892, + "grad_norm": 0.22490835189819336, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6985, + "grad_norm": 0.3242700695991516, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6881, + "grad_norm": 0.15535849332809448, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.708, + "grad_norm": 0.3692547082901001, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6791, + "grad_norm": 0.21382948756217957, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6947, + "grad_norm": 0.23230446875095367, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6921, + "grad_norm": 0.4255184829235077, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6819, + "grad_norm": 0.3195727467536926, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6907, + "grad_norm": 0.25535380840301514, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6841, + "grad_norm": 0.2917921841144562, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6804, + "grad_norm": 0.21833764016628265, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6811, + "grad_norm": 0.4681790769100189, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6882, + "grad_norm": 0.21810591220855713, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6702, + "grad_norm": 0.5053053498268127, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6852, + "grad_norm": 0.2549866735935211, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.7092, + "grad_norm": 0.5446391701698303, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7031, + "grad_norm": 0.5939232707023621, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6894, + "grad_norm": 0.3148786425590515, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6922, + "grad_norm": 0.302279531955719, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6709, + "grad_norm": 0.2708870470523834, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6739, + "grad_norm": 0.3843221664428711, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.7044, + "grad_norm": 0.41796034574508667, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6871, + "grad_norm": 0.2966718375682831, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6798, + "grad_norm": 0.3628740906715393, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6792, + "grad_norm": 0.3824528455734253, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6809, + "grad_norm": 0.3693743646144867, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6638, + "grad_norm": 0.45180243253707886, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6881, + "grad_norm": 0.4745689928531647, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6812, + "grad_norm": 0.39209091663360596, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6359, + "grad_norm": 0.5042276382446289, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6771, + "grad_norm": 0.45253047347068787, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.654, + "grad_norm": 0.520455539226532, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6766, + "grad_norm": 0.6826391220092773, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7024, + "grad_norm": 0.9243565201759338, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.669, + "grad_norm": 0.5933087468147278, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6873, + "grad_norm": 1.014005184173584, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.667, + "grad_norm": 1.2020173072814941, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6916, + "grad_norm": 0.9538174271583557, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.7177, + "grad_norm": 0.7869241833686829, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6598, + "grad_norm": 0.6747916340827942, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6868, + "grad_norm": 0.6702748537063599, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6894, + "grad_norm": 0.7077994346618652, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6861, + "grad_norm": 0.7142360210418701, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7079, + "grad_norm": 0.9747588038444519, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7046, + "grad_norm": 0.5830492973327637, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6723, + "grad_norm": 0.4663049280643463, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6631, + "grad_norm": 0.48369091749191284, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6565, + "grad_norm": 0.5685160756111145, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6904, + "grad_norm": 0.5871265530586243, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352357376, + "loss": 0.6711, + "grad_norm": 0.5238766670227051, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6649, + "grad_norm": 0.4528191089630127, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6887, + "grad_norm": 0.3769638240337372, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6706, + "grad_norm": 0.4239142835140228, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6877, + "grad_norm": 0.49645620584487915, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6551, + "grad_norm": 0.4547468423843384, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6788, + "grad_norm": 0.6517921090126038, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.697, + "grad_norm": 0.45100662112236023, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.663, + "grad_norm": 0.47672438621520996, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.695, + "grad_norm": 0.7985601425170898, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7137, + "grad_norm": 0.5589563846588135, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6918, + "grad_norm": 0.5139294266700745, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6549, + "grad_norm": 0.45009109377861023, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6673, + "grad_norm": 0.4339359700679779, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6859, + "grad_norm": 0.4537097215652466, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6755, + "grad_norm": 0.41648659110069275, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6523, + "grad_norm": 0.624713659286499, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.7097, + "grad_norm": 0.5285186171531677, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.685, + "grad_norm": 0.47190508246421814, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352354304, + "loss": 0.6654, + "grad_norm": 0.490545392036438, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6762, + "grad_norm": 0.5042676329612732, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.696, + "grad_norm": 0.546953022480011, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7153, + "grad_norm": 0.516740620136261, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6802, + "grad_norm": 0.5863431096076965, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.7158, + "grad_norm": 0.9267789721488953, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6872, + "grad_norm": 0.49865931272506714, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6728, + "grad_norm": 0.7071788311004639, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6872, + "grad_norm": 0.524202287197113, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6947, + "grad_norm": 0.5448168516159058, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.645, + "grad_norm": 0.5859795808792114, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6707, + "grad_norm": 0.4346643090248108, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6962, + "grad_norm": 0.799657940864563, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.689, + "grad_norm": 0.6018645763397217, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.677, + "grad_norm": 0.44465598464012146, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.667, + "grad_norm": 0.5111331343650818, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6628, + "grad_norm": 0.7975919246673584, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6971, + "grad_norm": 0.7185885906219482, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6806, + "grad_norm": 0.573652446269989, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6845, + "grad_norm": 0.6192213892936707, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6704, + "grad_norm": 0.5589597821235657, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6683, + "grad_norm": 0.5322818160057068, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6672, + "grad_norm": 0.7597686648368835, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6668, + "grad_norm": 0.6358884572982788, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6916, + "grad_norm": 0.6729602217674255, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6806, + "grad_norm": 0.6088345050811768, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6929, + "grad_norm": 0.8367296457290649, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.7127, + "grad_norm": 0.6685983538627625, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6994, + "grad_norm": 0.6598076820373535, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6743, + "grad_norm": 0.6219344139099121, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6663, + "grad_norm": 0.901719868183136, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.678, + "grad_norm": 0.6553974151611328, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.6652, + "grad_norm": 0.6581661105155945, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6606, + "grad_norm": 0.6388129591941833, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6661, + "grad_norm": 0.5000895261764526, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6342, + "grad_norm": 0.7444621324539185, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6754, + "grad_norm": 0.7770431637763977, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352352768, + "loss": 0.6328, + "grad_norm": 0.6519712805747986, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6659, + "grad_norm": 0.6867470741271973, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6865, + "grad_norm": 0.7203111052513123, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6784, + "grad_norm": 0.8268718719482422, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6812, + "grad_norm": 1.0130610466003418, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6771, + "grad_norm": 0.6728276014328003, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6687, + "grad_norm": 0.7160840034484863, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6536, + "grad_norm": 0.9922387003898621, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352351232, + "loss": 0.6796, + "grad_norm": 0.6761037111282349, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6889, + "grad_norm": 0.8063040375709534, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6344, + "grad_norm": 0.6899248361587524, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6969, + "grad_norm": 0.7980103492736816, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6952, + "grad_norm": 0.7644289135932922, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6908, + "grad_norm": 0.7800067067146301, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.7007, + "grad_norm": 0.7677456736564636, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.713, + "grad_norm": 0.7357360124588013, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352349696, + "loss": 0.6781, + "grad_norm": 0.7277910113334656, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6485, + "grad_norm": 0.654171884059906, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352338944, + "loss": 0.6851, + "grad_norm": 0.9097616076469421, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.7163, + "grad_norm": 0.8270599246025085, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6852, + "grad_norm": 0.623088538646698, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6408, + "grad_norm": 0.8615767359733582, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6465, + "grad_norm": 0.7764793634414673, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6668, + "grad_norm": 0.6064276695251465, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6539, + "grad_norm": 0.784004807472229, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.681, + "grad_norm": 0.8682699799537659, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6807, + "grad_norm": 0.6872929930686951, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6733, + "grad_norm": 0.844731330871582, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6934, + "grad_norm": 0.78240567445755, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6395, + "grad_norm": 0.6723353266716003, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6572, + "grad_norm": 0.8500782251358032, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6531, + "grad_norm": 0.7276672124862671, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6866, + "grad_norm": 0.6828616261482239, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.6775, + "grad_norm": 0.7734342813491821, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6725, + "grad_norm": 0.6623788475990295, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.7061, + "grad_norm": 0.8566242456436157, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.648, + "grad_norm": 0.8611614108085632, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6886, + "grad_norm": 0.7277034521102905, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6443, + "grad_norm": 0.6561474204063416, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6945, + "grad_norm": 0.7757680416107178, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6968, + "grad_norm": 0.835774838924408, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6195, + "grad_norm": 0.8312222361564636, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352342016, + "loss": 0.7151, + "grad_norm": 0.8102865815162659, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234048, + "loss": 0.6925, + "grad_norm": 0.8188010454177856, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352354304, + "loss": 0.65, + "grad_norm": 0.8779844641685486, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6486, + "grad_norm": 0.6420283317565918, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6458, + "grad_norm": 0.6414893269538879, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352345088, + "loss": 0.6486, + "grad_norm": 0.807214617729187, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6789, + "grad_norm": 0.6743530631065369, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352337408, + "loss": 0.663, + "grad_norm": 0.6953024864196777, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 3.274350592, + "gpu_mem": 1.35234816, + "loss": 0.6664, + "grad_norm": 0.8051717281341553, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6827, + "grad_norm": 0.8205377459526062, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352346624, + "loss": 0.6776, + "grad_norm": 0.7422493696212769, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "loss": 0.6757, + "grad_norm": 0.7327739000320435, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 3.274350592, + "gpu_mem": 1.352343552, + "train_runtime": 1465.15, + "train_samples_per_second": 27.94, + "train_steps_per_second": 0.437, + "total_flos": 1.4912476582969344e+16, + "train_loss": 0.7159746838733554 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/README.md b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e76d8bbcd207e545f2f008a678dd387957d3c8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama_v1.1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/adapter_config.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a34e999804ff05ab393ed2117c936e4d7827f88f --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/eval_results.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0d9455606f25e24b1e64152724158e17603c8234 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "winogrande", + "results": 0.5114443567482242 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/training_configuration.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..3417d17c59cd5d04fbdb2eff65c56b1fb374c6a8 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "WINOGRANDE", + "dataset_id": "allenai/winogrande", + "preprocess_id": "winogrande_train_deepeval" + }, + "peft_config": { + "method": "loraq4", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 6307840 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 4, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-loraq4-winogrande-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-loraq4-fix/TinyLlama_v1.1-loraq4-winogrande-r8-a2", + "seed": 42, + "timestamp": "2025-09-01T05:58:18.840145" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/training_logs.json b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..b39298db8c7b19bc53b9b5b9d6eff06b9901bf93 --- /dev/null +++ b/TinyLlama_v1.1-loraq4/TinyLlama_v1.1-loraq4-winogrande-r8-a2/training_logs.json @@ -0,0 +1,5773 @@ +[ + { + "step": 1, + "epoch": 0.00625, + "cpu_mem": 3.334053888, + "gpu_mem": 1.074799104, + "loss": 3.2646, + "grad_norm": 31.348905563354492, + "learning_rate": 4.6875e-06 + }, + { + "step": 2, + "epoch": 0.0125, + "cpu_mem": 3.334643712, + "gpu_mem": 1.125259776, + "loss": 3.2643, + "grad_norm": 30.573467254638672, + "learning_rate": 9.375e-06 + }, + { + "step": 3, + "epoch": 0.01875, + "cpu_mem": 3.335036928, + "gpu_mem": 1.125264384, + "loss": 3.1225, + "grad_norm": 29.93150520324707, + "learning_rate": 1.40625e-05 + }, + { + "step": 4, + "epoch": 0.025, + "cpu_mem": 3.335430144, + "gpu_mem": 1.125262848, + "loss": 2.9066, + "grad_norm": 29.105424880981445, + "learning_rate": 1.875e-05 + }, + { + "step": 5, + "epoch": 0.03125, + "cpu_mem": 3.335626752, + "gpu_mem": 1.125262848, + "loss": 2.7604, + "grad_norm": 29.79389190673828, + "learning_rate": 2.3437499999999997e-05 + }, + { + "step": 6, + "epoch": 0.0375, + "cpu_mem": 3.336019968, + "gpu_mem": 1.125268992, + "loss": 2.5539, + "grad_norm": 30.576383590698242, + "learning_rate": 2.8125e-05 + }, + { + "step": 7, + "epoch": 0.04375, + "cpu_mem": 3.336216576, + "gpu_mem": 1.125275136, + "loss": 2.1306, + "grad_norm": 29.040273666381836, + "learning_rate": 3.28125e-05 + }, + { + "step": 8, + "epoch": 0.05, + "cpu_mem": 3.336413184, + "gpu_mem": 1.12525824, + "loss": 1.8327, + "grad_norm": 24.18002700805664, + "learning_rate": 3.75e-05 + }, + { + "step": 9, + "epoch": 0.05625, + "cpu_mem": 3.336609792, + "gpu_mem": 1.125264384, + "loss": 1.598, + "grad_norm": 19.806760787963867, + "learning_rate": 4.2187499999999995e-05 + }, + { + "step": 10, + "epoch": 0.0625, + "cpu_mem": 3.3368064, + "gpu_mem": 1.125267456, + "loss": 1.2729, + "grad_norm": 12.563774108886719, + "learning_rate": 4.6874999999999994e-05 + }, + { + "step": 11, + "epoch": 0.06875, + "cpu_mem": 3.337003008, + "gpu_mem": 1.125256704, + "loss": 1.0205, + "grad_norm": 8.938533782958984, + "learning_rate": 5.156249999999999e-05 + }, + { + "step": 12, + "epoch": 0.075, + "cpu_mem": 3.337199616, + "gpu_mem": 1.125261312, + "loss": 1.015, + "grad_norm": 11.079157829284668, + "learning_rate": 5.625e-05 + }, + { + "step": 13, + "epoch": 0.08125, + "cpu_mem": 3.337199616, + "gpu_mem": 1.125268992, + "loss": 0.8419, + "grad_norm": 5.969776153564453, + "learning_rate": 6.09375e-05 + }, + { + "step": 14, + "epoch": 0.0875, + "cpu_mem": 3.337396224, + "gpu_mem": 1.125264384, + "loss": 0.7682, + "grad_norm": 4.40029239654541, + "learning_rate": 6.5625e-05 + }, + { + "step": 15, + "epoch": 0.09375, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125264384, + "loss": 0.8347, + "grad_norm": 13.470856666564941, + "learning_rate": 7.03125e-05 + }, + { + "step": 16, + "epoch": 0.1, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125261312, + "loss": 0.8392, + "grad_norm": 14.965635299682617, + "learning_rate": 7.5e-05 + }, + { + "step": 17, + "epoch": 0.10625, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125261312, + "loss": 0.759, + "grad_norm": 11.945836067199707, + "learning_rate": 7.968749999999999e-05 + }, + { + "step": 18, + "epoch": 0.1125, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125264384, + "loss": 0.7806, + "grad_norm": 6.744518280029297, + "learning_rate": 8.437499999999999e-05 + }, + { + "step": 19, + "epoch": 0.11875, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125261312, + "loss": 0.7629, + "grad_norm": 8.484512329101562, + "learning_rate": 8.906249999999999e-05 + }, + { + "step": 20, + "epoch": 0.125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.7437, + "grad_norm": 7.3926472663879395, + "learning_rate": 9.374999999999999e-05 + }, + { + "step": 21, + "epoch": 0.13125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7135, + "grad_norm": 3.0578315258026123, + "learning_rate": 9.843749999999999e-05 + }, + { + "step": 22, + "epoch": 0.1375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7401, + "grad_norm": 4.140644073486328, + "learning_rate": 0.00010312499999999999 + }, + { + "step": 23, + "epoch": 0.14375, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125256704, + "loss": 0.6983, + "grad_norm": 2.733891010284424, + "learning_rate": 0.00010781249999999998 + }, + { + "step": 24, + "epoch": 0.15, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125259776, + "loss": 0.6889, + "grad_norm": 1.042647123336792, + "learning_rate": 0.0001125 + }, + { + "step": 25, + "epoch": 0.15625, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125262848, + "loss": 0.7126, + "grad_norm": 1.178985357284546, + "learning_rate": 0.0001171875 + }, + { + "step": 26, + "epoch": 0.1625, + "cpu_mem": 3.338182656, + "gpu_mem": 1.12525824, + "loss": 0.7267, + "grad_norm": 4.1669392585754395, + "learning_rate": 0.000121875 + }, + { + "step": 27, + "epoch": 0.16875, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125256704, + "loss": 0.6903, + "grad_norm": 2.1856868267059326, + "learning_rate": 0.0001265625 + }, + { + "step": 28, + "epoch": 0.175, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125262848, + "loss": 0.7289, + "grad_norm": 3.3991594314575195, + "learning_rate": 0.00013125 + }, + { + "step": 29, + "epoch": 0.18125, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125261312, + "loss": 0.7163, + "grad_norm": 1.0145467519760132, + "learning_rate": 0.0001359375 + }, + { + "step": 30, + "epoch": 0.1875, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125261312, + "loss": 0.726, + "grad_norm": 4.1021223068237305, + "learning_rate": 0.000140625 + }, + { + "step": 31, + "epoch": 0.19375, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125261312, + "loss": 0.6879, + "grad_norm": 1.0327023267745972, + "learning_rate": 0.0001453125 + }, + { + "step": 32, + "epoch": 0.2, + "cpu_mem": 3.338182656, + "gpu_mem": 1.12525824, + "loss": 0.6722, + "grad_norm": 1.394906759262085, + "learning_rate": 0.00015 + }, + { + "step": 33, + "epoch": 0.20625, + "cpu_mem": 3.338182656, + "gpu_mem": 1.12525824, + "loss": 0.8145, + "grad_norm": 6.382537364959717, + "learning_rate": 0.00015468749999999999 + }, + { + "step": 34, + "epoch": 0.2125, + "cpu_mem": 3.338182656, + "gpu_mem": 1.12525824, + "loss": 0.7533, + "grad_norm": 4.025751113891602, + "learning_rate": 0.00015937499999999998 + }, + { + "step": 35, + "epoch": 0.21875, + "cpu_mem": 3.338182656, + "gpu_mem": 1.125264384, + "loss": 0.7148, + "grad_norm": 0.785695493221283, + "learning_rate": 0.00016406249999999998 + }, + { + "step": 36, + "epoch": 0.225, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125259776, + "loss": 0.7451, + "grad_norm": 3.199381113052368, + "learning_rate": 0.00016874999999999998 + }, + { + "step": 37, + "epoch": 0.23125, + "cpu_mem": 3.338379264, + "gpu_mem": 1.12525824, + "loss": 0.6927, + "grad_norm": 0.5238595008850098, + "learning_rate": 0.00017343749999999998 + }, + { + "step": 38, + "epoch": 0.2375, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125262848, + "loss": 0.7011, + "grad_norm": 1.056441068649292, + "learning_rate": 0.00017812499999999998 + }, + { + "step": 39, + "epoch": 0.24375, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125268992, + "loss": 0.691, + "grad_norm": 2.1621549129486084, + "learning_rate": 0.00018281249999999998 + }, + { + "step": 40, + "epoch": 0.25, + "cpu_mem": 3.338379264, + "gpu_mem": 1.12526592, + "loss": 0.7801, + "grad_norm": 5.166017055511475, + "learning_rate": 0.00018749999999999998 + }, + { + "step": 41, + "epoch": 0.25625, + "cpu_mem": 3.338379264, + "gpu_mem": 1.12526592, + "loss": 0.7391, + "grad_norm": 3.116899251937866, + "learning_rate": 0.00019218749999999998 + }, + { + "step": 42, + "epoch": 0.2625, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125262848, + "loss": 0.7303, + "grad_norm": 3.34189510345459, + "learning_rate": 0.00019687499999999997 + }, + { + "step": 43, + "epoch": 0.26875, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125262848, + "loss": 0.7254, + "grad_norm": 2.8347527980804443, + "learning_rate": 0.00020156249999999997 + }, + { + "step": 44, + "epoch": 0.275, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125262848, + "loss": 0.7035, + "grad_norm": 0.8919041156768799, + "learning_rate": 0.00020624999999999997 + }, + { + "step": 45, + "epoch": 0.28125, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125270528, + "loss": 0.6947, + "grad_norm": 0.6013179421424866, + "learning_rate": 0.00021093749999999997 + }, + { + "step": 46, + "epoch": 0.2875, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125262848, + "loss": 0.7117, + "grad_norm": 0.4496428668498993, + "learning_rate": 0.00021562499999999997 + }, + { + "step": 47, + "epoch": 0.29375, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125264384, + "loss": 0.7206, + "grad_norm": 0.6632198095321655, + "learning_rate": 0.00022031249999999997 + }, + { + "step": 48, + "epoch": 0.3, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125264384, + "loss": 0.6879, + "grad_norm": 1.0655150413513184, + "learning_rate": 0.000225 + }, + { + "step": 49, + "epoch": 0.30625, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125255168, + "loss": 0.7626, + "grad_norm": 3.381382703781128, + "learning_rate": 0.0002296875 + }, + { + "step": 50, + "epoch": 0.3125, + "cpu_mem": 3.338379264, + "gpu_mem": 1.12525824, + "loss": 0.7183, + "grad_norm": 1.202592372894287, + "learning_rate": 0.000234375 + }, + { + "step": 51, + "epoch": 0.31875, + "cpu_mem": 3.338379264, + "gpu_mem": 1.12526592, + "loss": 0.6969, + "grad_norm": 1.201512098312378, + "learning_rate": 0.0002390625 + }, + { + "step": 52, + "epoch": 0.325, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125259776, + "loss": 0.7542, + "grad_norm": 2.3213555812835693, + "learning_rate": 0.00024375 + }, + { + "step": 53, + "epoch": 0.33125, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125264384, + "loss": 0.8049, + "grad_norm": 3.1781206130981445, + "learning_rate": 0.00024843749999999996 + }, + { + "step": 54, + "epoch": 0.3375, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125252096, + "loss": 0.7016, + "grad_norm": 0.933201014995575, + "learning_rate": 0.000253125 + }, + { + "step": 55, + "epoch": 0.34375, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125256704, + "loss": 0.6916, + "grad_norm": 0.40164893865585327, + "learning_rate": 0.00025781249999999996 + }, + { + "step": 56, + "epoch": 0.35, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125264384, + "loss": 0.7104, + "grad_norm": 1.4165618419647217, + "learning_rate": 0.0002625 + }, + { + "step": 57, + "epoch": 0.35625, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125255168, + "loss": 0.7386, + "grad_norm": 2.3399786949157715, + "learning_rate": 0.00026718749999999996 + }, + { + "step": 58, + "epoch": 0.3625, + "cpu_mem": 3.338379264, + "gpu_mem": 1.12525824, + "loss": 0.7064, + "grad_norm": 2.464600086212158, + "learning_rate": 0.000271875 + }, + { + "step": 59, + "epoch": 0.36875, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125270528, + "loss": 0.6891, + "grad_norm": 1.8421812057495117, + "learning_rate": 0.00027656249999999995 + }, + { + "step": 60, + "epoch": 0.375, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125261312, + "loss": 0.7241, + "grad_norm": 1.9040037393569946, + "learning_rate": 0.00028125 + }, + { + "step": 61, + "epoch": 0.38125, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125261312, + "loss": 0.7084, + "grad_norm": 1.2278995513916016, + "learning_rate": 0.00028593749999999995 + }, + { + "step": 62, + "epoch": 0.3875, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125261312, + "loss": 0.7082, + "grad_norm": 1.4662808179855347, + "learning_rate": 0.000290625 + }, + { + "step": 63, + "epoch": 0.39375, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125259776, + "loss": 0.6962, + "grad_norm": 1.6732220649719238, + "learning_rate": 0.00029531249999999995 + }, + { + "step": 64, + "epoch": 0.4, + "cpu_mem": 3.338379264, + "gpu_mem": 1.125268992, + "loss": 0.721, + "grad_norm": 1.709835410118103, + "learning_rate": 0.0003 + }, + { + "step": 65, + "epoch": 0.40625, + "cpu_mem": 3.338379264, + "gpu_mem": 1.12525824, + "loss": 0.6895, + "grad_norm": 0.47113358974456787, + "learning_rate": 0.00029999776892091325 + }, + { + "step": 66, + "epoch": 0.4125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.711, + "grad_norm": 1.1039882898330688, + "learning_rate": 0.00029999107575002246 + }, + { + "step": 67, + "epoch": 0.41875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.7007, + "grad_norm": 1.169838786125183, + "learning_rate": 0.0002999799206864343 + }, + { + "step": 68, + "epoch": 0.425, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.6985, + "grad_norm": 0.2674361765384674, + "learning_rate": 0.0002999643040619863 + }, + { + "step": 69, + "epoch": 0.43125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6888, + "grad_norm": 0.33002161979675293, + "learning_rate": 0.0002999442263412377 + }, + { + "step": 70, + "epoch": 0.4375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125268992, + "loss": 0.6979, + "grad_norm": 0.8321757316589355, + "learning_rate": 0.00029991968812145484 + }, + { + "step": 71, + "epoch": 0.44375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125267456, + "loss": 0.7013, + "grad_norm": 0.2042980194091797, + "learning_rate": 0.00029989069013259374 + }, + { + "step": 72, + "epoch": 0.45, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6945, + "grad_norm": 0.2417713701725006, + "learning_rate": 0.00029985723323727866 + }, + { + "step": 73, + "epoch": 0.45625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.721, + "grad_norm": 1.3082057237625122, + "learning_rate": 0.00029981931843077583 + }, + { + "step": 74, + "epoch": 0.4625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125256704, + "loss": 0.6935, + "grad_norm": 0.2232559323310852, + "learning_rate": 0.00029977694684096444 + }, + { + "step": 75, + "epoch": 0.46875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6804, + "grad_norm": 0.6609664559364319, + "learning_rate": 0.0002997301197283027 + }, + { + "step": 76, + "epoch": 0.475, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.7736, + "grad_norm": 2.1837685108184814, + "learning_rate": 0.0002996788384857905 + }, + { + "step": 77, + "epoch": 0.48125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125268992, + "loss": 0.7195, + "grad_norm": 1.0964577198028564, + "learning_rate": 0.00029962310463892795 + }, + { + "step": 78, + "epoch": 0.4875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.7005, + "grad_norm": 0.6466937065124512, + "learning_rate": 0.00029956291984566997 + }, + { + "step": 79, + "epoch": 0.49375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.701, + "grad_norm": 0.42822495102882385, + "learning_rate": 0.00029949828589637703 + }, + { + "step": 80, + "epoch": 0.5, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.7125, + "grad_norm": 0.9681013822555542, + "learning_rate": 0.0002994292047137618 + }, + { + "step": 81, + "epoch": 0.50625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125255168, + "loss": 0.7103, + "grad_norm": 0.7473350763320923, + "learning_rate": 0.00029935567835283203 + }, + { + "step": 82, + "epoch": 0.5125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125256704, + "loss": 0.6923, + "grad_norm": 0.19296567142009735, + "learning_rate": 0.00029927770900082954 + }, + { + "step": 83, + "epoch": 0.51875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125259776, + "loss": 0.6899, + "grad_norm": 0.4977751076221466, + "learning_rate": 0.0002991952989771647 + }, + { + "step": 84, + "epoch": 0.525, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125256704, + "loss": 0.7187, + "grad_norm": 1.0494526624679565, + "learning_rate": 0.0002991084507333479 + }, + { + "step": 85, + "epoch": 0.53125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.7083, + "grad_norm": 0.7746499180793762, + "learning_rate": 0.00029901716685291663 + }, + { + "step": 86, + "epoch": 0.5375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.702, + "grad_norm": 0.6742169857025146, + "learning_rate": 0.0002989214500513582 + }, + { + "step": 87, + "epoch": 0.54375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.7036, + "grad_norm": 1.082574725151062, + "learning_rate": 0.0002988213031760294 + }, + { + "step": 88, + "epoch": 0.55, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.699, + "grad_norm": 0.506112813949585, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 89, + "epoch": 0.55625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6934, + "grad_norm": 0.2280891239643097, + "learning_rate": 0.0002986077312523219 + }, + { + "step": 90, + "epoch": 0.5625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.7071, + "grad_norm": 0.9058667421340942, + "learning_rate": 0.00029849431255722116 + }, + { + "step": 91, + "epoch": 0.56875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125256704, + "loss": 0.7043, + "grad_norm": 1.3870923519134521, + "learning_rate": 0.00029837647649471715 + }, + { + "step": 92, + "epoch": 0.575, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.7189, + "grad_norm": 1.318220615386963, + "learning_rate": 0.0002982542265701641 + }, + { + "step": 93, + "epoch": 0.58125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.712, + "grad_norm": 0.8364346027374268, + "learning_rate": 0.0002981275664202187 + }, + { + "step": 94, + "epoch": 0.5875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125259776, + "loss": 0.6969, + "grad_norm": 0.27816787362098694, + "learning_rate": 0.00029799649981273186 + }, + { + "step": 95, + "epoch": 0.59375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.6881, + "grad_norm": 0.4684586524963379, + "learning_rate": 0.00029786103064663634 + }, + { + "step": 96, + "epoch": 0.6, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.7005, + "grad_norm": 0.7952578067779541, + "learning_rate": 0.0002977211629518312 + }, + { + "step": 97, + "epoch": 0.60625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.7039, + "grad_norm": 0.3836963474750519, + "learning_rate": 0.00029757690088906156 + }, + { + "step": 98, + "epoch": 0.6125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.7024, + "grad_norm": 0.3860487937927246, + "learning_rate": 0.00029742824874979515 + }, + { + "step": 99, + "epoch": 0.61875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.6808, + "grad_norm": 0.7013923525810242, + "learning_rate": 0.0002972752109560943 + }, + { + "step": 100, + "epoch": 0.625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6982, + "grad_norm": 0.4964832365512848, + "learning_rate": 0.00029711779206048454 + }, + { + "step": 101, + "epoch": 0.63125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.6959, + "grad_norm": 0.33692336082458496, + "learning_rate": 0.0002969559967458194 + }, + { + "step": 102, + "epoch": 0.6375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125268992, + "loss": 0.6992, + "grad_norm": 0.6139791011810303, + "learning_rate": 0.0002967898298251407 + }, + { + "step": 103, + "epoch": 0.64375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.6933, + "grad_norm": 0.5420084595680237, + "learning_rate": 0.0002966192962415358 + }, + { + "step": 104, + "epoch": 0.65, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.702, + "grad_norm": 0.20431657135486603, + "learning_rate": 0.00029644440106799 + }, + { + "step": 105, + "epoch": 0.65625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125259776, + "loss": 0.6916, + "grad_norm": 0.3810305893421173, + "learning_rate": 0.00029626514950723627 + }, + { + "step": 106, + "epoch": 0.6625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.6992, + "grad_norm": 0.5180448889732361, + "learning_rate": 0.0002960815468916 + }, + { + "step": 107, + "epoch": 0.66875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6916, + "grad_norm": 0.15606437623500824, + "learning_rate": 0.0002958935986828407 + }, + { + "step": 108, + "epoch": 0.675, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.7015, + "grad_norm": 0.6650442481040955, + "learning_rate": 0.00029570131047198915 + }, + { + "step": 109, + "epoch": 0.68125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.7094, + "grad_norm": 0.27570831775665283, + "learning_rate": 0.0002955046879791816 + }, + { + "step": 110, + "epoch": 0.6875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125267456, + "loss": 0.6921, + "grad_norm": 0.42183470726013184, + "learning_rate": 0.00029530373705348895 + }, + { + "step": 111, + "epoch": 0.69375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.7065, + "grad_norm": 0.30969876050949097, + "learning_rate": 0.00029509846367274336 + }, + { + "step": 112, + "epoch": 0.7, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6939, + "grad_norm": 0.14388111233711243, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 113, + "epoch": 0.70625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125253632, + "loss": 0.6878, + "grad_norm": 0.5401323437690735, + "learning_rate": 0.00029467497410015625 + }, + { + "step": 114, + "epoch": 0.7125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.6962, + "grad_norm": 0.1796480268239975, + "learning_rate": 0.00029445677050616437 + }, + { + "step": 115, + "epoch": 0.71875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6993, + "grad_norm": 0.31227290630340576, + "learning_rate": 0.0002942342696524443 + }, + { + "step": 116, + "epoch": 0.725, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125256704, + "loss": 0.7332, + "grad_norm": 1.4486843347549438, + "learning_rate": 0.0002940074781578893 + }, + { + "step": 117, + "epoch": 0.73125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6914, + "grad_norm": 0.2258821278810501, + "learning_rate": 0.00029377640276902954 + }, + { + "step": 118, + "epoch": 0.7375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.6944, + "grad_norm": 0.22110995650291443, + "learning_rate": 0.0002935410503598313 + }, + { + "step": 119, + "epoch": 0.74375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.73, + "grad_norm": 1.294443964958191, + "learning_rate": 0.00029330142793149237 + }, + { + "step": 120, + "epoch": 0.75, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.697, + "grad_norm": 0.215035080909729, + "learning_rate": 0.000293057542612234 + }, + { + "step": 121, + "epoch": 0.75625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6993, + "grad_norm": 0.31354016065597534, + "learning_rate": 0.0002928094016570886 + }, + { + "step": 122, + "epoch": 0.7625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.7071, + "grad_norm": 0.922015905380249, + "learning_rate": 0.00029255701244768414 + }, + { + "step": 123, + "epoch": 0.76875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.6961, + "grad_norm": 0.12846408784389496, + "learning_rate": 0.0002923003824920244 + }, + { + "step": 124, + "epoch": 0.775, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.6984, + "grad_norm": 0.3582961857318878, + "learning_rate": 0.0002920395194242658 + }, + { + "step": 125, + "epoch": 0.78125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125253632, + "loss": 0.7011, + "grad_norm": 0.3661620616912842, + "learning_rate": 0.00029177443100449014 + }, + { + "step": 126, + "epoch": 0.7875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.7059, + "grad_norm": 0.5112825036048889, + "learning_rate": 0.00029150512511847375 + }, + { + "step": 127, + "epoch": 0.79375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.6953, + "grad_norm": 0.18130381405353546, + "learning_rate": 0.00029123160977745306 + }, + { + "step": 128, + "epoch": 0.8, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125256704, + "loss": 0.7018, + "grad_norm": 0.47207528352737427, + "learning_rate": 0.0002909538931178862 + }, + { + "step": 129, + "epoch": 0.80625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6955, + "grad_norm": 0.15063069760799408, + "learning_rate": 0.00029067198340121094 + }, + { + "step": 130, + "epoch": 0.8125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6977, + "grad_norm": 0.23474393784999847, + "learning_rate": 0.00029038588901359884 + }, + { + "step": 131, + "epoch": 0.81875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125259776, + "loss": 0.6956, + "grad_norm": 0.15187595784664154, + "learning_rate": 0.00029009561846570604 + }, + { + "step": 132, + "epoch": 0.825, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.7016, + "grad_norm": 0.5358698964118958, + "learning_rate": 0.00028980118039241976 + }, + { + "step": 133, + "epoch": 0.83125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6987, + "grad_norm": 0.8203025460243225, + "learning_rate": 0.00028950258355260177 + }, + { + "step": 134, + "epoch": 0.8375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.7008, + "grad_norm": 0.3519965708255768, + "learning_rate": 0.00028919983682882766 + }, + { + "step": 135, + "epoch": 0.84375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125267456, + "loss": 0.6716, + "grad_norm": 0.1485525667667389, + "learning_rate": 0.0002888929492271224 + }, + { + "step": 136, + "epoch": 0.85, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.6794, + "grad_norm": 0.23088304698467255, + "learning_rate": 0.000288581929876693 + }, + { + "step": 137, + "epoch": 0.85625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.7045, + "grad_norm": 0.5907098650932312, + "learning_rate": 0.00028826678802965614 + }, + { + "step": 138, + "epoch": 0.8625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.7283, + "grad_norm": 0.831703782081604, + "learning_rate": 0.0002879475330607638 + }, + { + "step": 139, + "epoch": 0.86875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12525824, + "loss": 0.7215, + "grad_norm": 0.7954141497612, + "learning_rate": 0.00028762417446712363 + }, + { + "step": 140, + "epoch": 0.875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.69, + "grad_norm": 0.2576504349708557, + "learning_rate": 0.00028729672186791704 + }, + { + "step": 141, + "epoch": 0.88125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.7003, + "grad_norm": 0.9135605096817017, + "learning_rate": 0.00028696518500411254 + }, + { + "step": 142, + "epoch": 0.8875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.701, + "grad_norm": 0.773673415184021, + "learning_rate": 0.0002866295737381763 + }, + { + "step": 143, + "epoch": 0.89375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125261312, + "loss": 0.6963, + "grad_norm": 0.10294463485479355, + "learning_rate": 0.0002862898980537788 + }, + { + "step": 144, + "epoch": 0.9, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.701, + "grad_norm": 0.44326943159103394, + "learning_rate": 0.0002859461680554975 + }, + { + "step": 145, + "epoch": 0.90625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125268992, + "loss": 0.6855, + "grad_norm": 0.18475618958473206, + "learning_rate": 0.0002855983939685165 + }, + { + "step": 146, + "epoch": 0.9125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6817, + "grad_norm": 0.09964551031589508, + "learning_rate": 0.0002852465861383224 + }, + { + "step": 147, + "epoch": 0.91875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125259776, + "loss": 0.7238, + "grad_norm": 1.3654460906982422, + "learning_rate": 0.00028489075503039643 + }, + { + "step": 148, + "epoch": 0.925, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6911, + "grad_norm": 0.2719765901565552, + "learning_rate": 0.00028453091122990323 + }, + { + "step": 149, + "epoch": 0.93125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125267456, + "loss": 0.6922, + "grad_norm": 0.4913496673107147, + "learning_rate": 0.0002841670654413757 + }, + { + "step": 150, + "epoch": 0.9375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125267456, + "loss": 0.6918, + "grad_norm": 0.548733651638031, + "learning_rate": 0.0002837992284883971 + }, + { + "step": 151, + "epoch": 0.94375, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6972, + "grad_norm": 0.3924992084503174, + "learning_rate": 0.0002834274113132784 + }, + { + "step": 152, + "epoch": 0.95, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125259776, + "loss": 0.6933, + "grad_norm": 0.08304734528064728, + "learning_rate": 0.0002830516249767332 + }, + { + "step": 153, + "epoch": 0.95625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.12526592, + "loss": 0.6907, + "grad_norm": 0.3129887580871582, + "learning_rate": 0.0002826718806575488 + }, + { + "step": 154, + "epoch": 0.9625, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125264384, + "loss": 0.6929, + "grad_norm": 0.26939156651496887, + "learning_rate": 0.0002822881896522532 + }, + { + "step": 155, + "epoch": 0.96875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125259776, + "loss": 0.6975, + "grad_norm": 0.766124427318573, + "learning_rate": 0.0002819005633747795 + }, + { + "step": 156, + "epoch": 0.975, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125255168, + "loss": 0.6983, + "grad_norm": 0.39280903339385986, + "learning_rate": 0.00028150901335612615 + }, + { + "step": 157, + "epoch": 0.98125, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125262848, + "loss": 0.6652, + "grad_norm": 1.476635456085205, + "learning_rate": 0.0002811135512440138 + }, + { + "step": 158, + "epoch": 0.9875, + "cpu_mem": 3.338575872, + "gpu_mem": 1.125272064, + "loss": 0.7124, + "grad_norm": 0.692840576171875, + "learning_rate": 0.0002807141888025392 + }, + { + "step": 159, + "epoch": 0.99375, + "cpu_mem": 3.332677632, + "gpu_mem": 1.125262848, + "loss": 0.6913, + "grad_norm": 0.34894928336143494, + "learning_rate": 0.00028031093791182484 + }, + { + "step": 160, + "epoch": 1.0, + "cpu_mem": 3.333070848, + "gpu_mem": 1.125256704, + "loss": 0.7178, + "grad_norm": 0.7027056813240051, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 161, + "epoch": 1.00625, + "cpu_mem": 3.33385728, + "gpu_mem": 1.125259776, + "loss": 0.7044, + "grad_norm": 0.4262169599533081, + "learning_rate": 0.0002794928188811727 + }, + { + "step": 162, + "epoch": 1.0125, + "cpu_mem": 3.334250496, + "gpu_mem": 1.125267456, + "loss": 0.6961, + "grad_norm": 0.20304028689861298, + "learning_rate": 0.0002790779750784118 + }, + { + "step": 163, + "epoch": 1.01875, + "cpu_mem": 3.334643712, + "gpu_mem": 1.125262848, + "loss": 0.6842, + "grad_norm": 0.4112168550491333, + "learning_rate": 0.0002786592915000408 + }, + { + "step": 164, + "epoch": 1.025, + "cpu_mem": 3.335036928, + "gpu_mem": 1.125264384, + "loss": 0.6761, + "grad_norm": 0.11614347994327545, + "learning_rate": 0.00027823678060094197 + }, + { + "step": 165, + "epoch": 1.03125, + "cpu_mem": 3.335233536, + "gpu_mem": 1.125267456, + "loss": 0.6824, + "grad_norm": 0.2793985903263092, + "learning_rate": 0.0002778104549498518 + }, + { + "step": 166, + "epoch": 1.0375, + "cpu_mem": 3.335430144, + "gpu_mem": 1.125261312, + "loss": 0.7341, + "grad_norm": 1.1876628398895264, + "learning_rate": 0.00027738032722898683 + }, + { + "step": 167, + "epoch": 1.04375, + "cpu_mem": 3.335626752, + "gpu_mem": 1.125259776, + "loss": 0.735, + "grad_norm": 1.156569004058838, + "learning_rate": 0.00027694641023366656 + }, + { + "step": 168, + "epoch": 1.05, + "cpu_mem": 3.336019968, + "gpu_mem": 1.125267456, + "loss": 0.7065, + "grad_norm": 0.5908222198486328, + "learning_rate": 0.0002765087168719328 + }, + { + "step": 169, + "epoch": 1.05625, + "cpu_mem": 3.336216576, + "gpu_mem": 1.125264384, + "loss": 0.6964, + "grad_norm": 0.18823248147964478, + "learning_rate": 0.00027606726016416567 + }, + { + "step": 170, + "epoch": 1.0625, + "cpu_mem": 3.336413184, + "gpu_mem": 1.125268992, + "loss": 0.6911, + "grad_norm": 0.17716974020004272, + "learning_rate": 0.00027562205324269617 + }, + { + "step": 171, + "epoch": 1.06875, + "cpu_mem": 3.336413184, + "gpu_mem": 1.12525824, + "loss": 0.7851, + "grad_norm": 3.084944486618042, + "learning_rate": 0.00027517310935141565 + }, + { + "step": 172, + "epoch": 1.075, + "cpu_mem": 3.336609792, + "gpu_mem": 1.125261312, + "loss": 0.7197, + "grad_norm": 1.467236042022705, + "learning_rate": 0.0002747204418453818 + }, + { + "step": 173, + "epoch": 1.08125, + "cpu_mem": 3.336609792, + "gpu_mem": 1.125253632, + "loss": 0.7043, + "grad_norm": 0.65302574634552, + "learning_rate": 0.00027426406419042135 + }, + { + "step": 174, + "epoch": 1.0875, + "cpu_mem": 3.3368064, + "gpu_mem": 1.125259776, + "loss": 0.695, + "grad_norm": 0.15446710586547852, + "learning_rate": 0.00027380398996272956 + }, + { + "step": 175, + "epoch": 1.09375, + "cpu_mem": 3.3368064, + "gpu_mem": 1.125262848, + "loss": 0.6895, + "grad_norm": 0.14360199868679047, + "learning_rate": 0.0002733402328484662 + }, + { + "step": 176, + "epoch": 1.1, + "cpu_mem": 3.337003008, + "gpu_mem": 1.12525824, + "loss": 0.7265, + "grad_norm": 1.6894454956054688, + "learning_rate": 0.00027287280664334875 + }, + { + "step": 177, + "epoch": 1.10625, + "cpu_mem": 3.337003008, + "gpu_mem": 1.125259776, + "loss": 0.7355, + "grad_norm": 1.6969295740127563, + "learning_rate": 0.0002724017252522415 + }, + { + "step": 178, + "epoch": 1.1125, + "cpu_mem": 3.337199616, + "gpu_mem": 1.125264384, + "loss": 0.6896, + "grad_norm": 0.287123441696167, + "learning_rate": 0.0002719270026887423 + }, + { + "step": 179, + "epoch": 1.11875, + "cpu_mem": 3.337199616, + "gpu_mem": 1.125264384, + "loss": 0.6984, + "grad_norm": 0.5458834171295166, + "learning_rate": 0.0002714486530747656 + }, + { + "step": 180, + "epoch": 1.125, + "cpu_mem": 3.337396224, + "gpu_mem": 1.125259776, + "loss": 0.6962, + "grad_norm": 0.5668650269508362, + "learning_rate": 0.0002709666906401224 + }, + { + "step": 181, + "epoch": 1.13125, + "cpu_mem": 3.337396224, + "gpu_mem": 1.125276672, + "loss": 0.7001, + "grad_norm": 1.3992080688476562, + "learning_rate": 0.0002704811297220967 + }, + { + "step": 182, + "epoch": 1.1375, + "cpu_mem": 3.337396224, + "gpu_mem": 1.125264384, + "loss": 0.734, + "grad_norm": 1.6586384773254395, + "learning_rate": 0.00026999198476501945 + }, + { + "step": 183, + "epoch": 1.14375, + "cpu_mem": 3.337396224, + "gpu_mem": 1.125261312, + "loss": 0.7351, + "grad_norm": 1.5944339036941528, + "learning_rate": 0.0002694992703198383 + }, + { + "step": 184, + "epoch": 1.15, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125261312, + "loss": 0.7069, + "grad_norm": 1.018441081047058, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 185, + "epoch": 1.15625, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125259776, + "loss": 0.6849, + "grad_norm": 0.13938173651695251, + "learning_rate": 0.0002685031916994403 + }, + { + "step": 186, + "epoch": 1.1625, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125267456, + "loss": 0.73, + "grad_norm": 1.3627973794937134, + "learning_rate": 0.0002679998571552925 + }, + { + "step": 187, + "epoch": 1.16875, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125264384, + "loss": 0.6786, + "grad_norm": 0.2917529046535492, + "learning_rate": 0.0002674930123842975 + }, + { + "step": 188, + "epoch": 1.175, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125262848, + "loss": 0.7042, + "grad_norm": 0.7940145134925842, + "learning_rate": 0.0002669826724639322 + }, + { + "step": 189, + "epoch": 1.18125, + "cpu_mem": 3.337592832, + "gpu_mem": 1.125261312, + "loss": 0.7024, + "grad_norm": 0.952116847038269, + "learning_rate": 0.0002664688525756463 + }, + { + "step": 190, + "epoch": 1.1875, + "cpu_mem": 3.337592832, + "gpu_mem": 1.12526592, + "loss": 0.7063, + "grad_norm": 1.3027238845825195, + "learning_rate": 0.0002659515680044105 + }, + { + "step": 191, + "epoch": 1.19375, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125256704, + "loss": 0.7223, + "grad_norm": 1.5002529621124268, + "learning_rate": 0.00026543083413826203 + }, + { + "step": 192, + "epoch": 1.2, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125261312, + "loss": 0.683, + "grad_norm": 0.7732875347137451, + "learning_rate": 0.00026490666646784665 + }, + { + "step": 193, + "epoch": 1.20625, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125259776, + "loss": 0.6885, + "grad_norm": 0.13847628235816956, + "learning_rate": 0.0002643790805859582 + }, + { + "step": 194, + "epoch": 1.2125, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125255168, + "loss": 0.6898, + "grad_norm": 0.13827762007713318, + "learning_rate": 0.00026384809218707423 + }, + { + "step": 195, + "epoch": 1.21875, + "cpu_mem": 3.33778944, + "gpu_mem": 1.12526592, + "loss": 0.7265, + "grad_norm": 1.606848955154419, + "learning_rate": 0.0002633137170668897 + }, + { + "step": 196, + "epoch": 1.225, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125264384, + "loss": 0.694, + "grad_norm": 0.39670172333717346, + "learning_rate": 0.0002627759711218466 + }, + { + "step": 197, + "epoch": 1.23125, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125268992, + "loss": 0.6942, + "grad_norm": 0.11334912478923798, + "learning_rate": 0.00026223487034866133 + }, + { + "step": 198, + "epoch": 1.2375, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125262848, + "loss": 0.6932, + "grad_norm": 0.13718892633914948, + "learning_rate": 0.00026169043084384896 + }, + { + "step": 199, + "epoch": 1.24375, + "cpu_mem": 3.33778944, + "gpu_mem": 1.12525824, + "loss": 0.6854, + "grad_norm": 0.1494653970003128, + "learning_rate": 0.00026114266880324387 + }, + { + "step": 200, + "epoch": 1.25, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125262848, + "loss": 0.7127, + "grad_norm": 0.6840382218360901, + "learning_rate": 0.0002605916005215186 + }, + { + "step": 201, + "epoch": 1.25625, + "cpu_mem": 3.33778944, + "gpu_mem": 1.125268992, + "loss": 0.7067, + "grad_norm": 0.43828630447387695, + "learning_rate": 0.00026003724239169874 + }, + { + "step": 202, + "epoch": 1.2625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6964, + "grad_norm": 0.6447861790657043, + "learning_rate": 0.00025947961090467533 + }, + { + "step": 203, + "epoch": 1.26875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6946, + "grad_norm": 0.9413389563560486, + "learning_rate": 0.0002589187226487144 + }, + { + "step": 204, + "epoch": 1.275, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6965, + "grad_norm": 0.3139038383960724, + "learning_rate": 0.0002583545943089633 + }, + { + "step": 205, + "epoch": 1.28125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.7031, + "grad_norm": 0.6115783452987671, + "learning_rate": 0.00025778724266695466 + }, + { + "step": 206, + "epoch": 1.2875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7178, + "grad_norm": 0.7768978476524353, + "learning_rate": 0.00025721668460010696 + }, + { + "step": 207, + "epoch": 1.29375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6918, + "grad_norm": 0.11720523238182068, + "learning_rate": 0.0002566429370812223 + }, + { + "step": 208, + "epoch": 1.3, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.7044, + "grad_norm": 0.6255006194114685, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 209, + "epoch": 1.30625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6898, + "grad_norm": 0.22264277935028076, + "learning_rate": 0.0002554859420524386 + }, + { + "step": 210, + "epoch": 1.3125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6904, + "grad_norm": 0.12505744397640228, + "learning_rate": 0.00025490272896050507 + }, + { + "step": 211, + "epoch": 1.31875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7218, + "grad_norm": 0.7488930225372314, + "learning_rate": 0.00025431639525144175 + }, + { + "step": 212, + "epoch": 1.325, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.695, + "grad_norm": 0.1913757622241974, + "learning_rate": 0.0002537269583673404 + }, + { + "step": 213, + "epoch": 1.33125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6892, + "grad_norm": 0.18769820034503937, + "learning_rate": 0.0002531344358426051 + }, + { + "step": 214, + "epoch": 1.3375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6895, + "grad_norm": 0.11731309443712234, + "learning_rate": 0.0002525388453034307 + }, + { + "step": 215, + "epoch": 1.34375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6986, + "grad_norm": 0.5063328146934509, + "learning_rate": 0.0002519402044672784 + }, + { + "step": 216, + "epoch": 1.35, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7092, + "grad_norm": 0.623552143573761, + "learning_rate": 0.00025133853114234905 + }, + { + "step": 217, + "epoch": 1.35625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.7008, + "grad_norm": 0.16271036863327026, + "learning_rate": 0.00025073384322705274 + }, + { + "step": 218, + "epoch": 1.3625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.699, + "grad_norm": 0.4856497645378113, + "learning_rate": 0.0002501261587094771 + }, + { + "step": 219, + "epoch": 1.36875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6938, + "grad_norm": 0.13967518508434296, + "learning_rate": 0.00024951549566685165 + }, + { + "step": 220, + "epoch": 1.375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6958, + "grad_norm": 0.1186295747756958, + "learning_rate": 0.0002489018722650103 + }, + { + "step": 221, + "epoch": 1.38125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6855, + "grad_norm": 0.401638001203537, + "learning_rate": 0.00024828530675785094 + }, + { + "step": 222, + "epoch": 1.3875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6982, + "grad_norm": 0.22315290570259094, + "learning_rate": 0.00024766581748679234 + }, + { + "step": 223, + "epoch": 1.39375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6885, + "grad_norm": 0.1469808667898178, + "learning_rate": 0.0002470434228802286 + }, + { + "step": 224, + "epoch": 1.4, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6944, + "grad_norm": 0.17828629910945892, + "learning_rate": 0.0002464181414529809 + }, + { + "step": 225, + "epoch": 1.40625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7097, + "grad_norm": 0.457846075296402, + "learning_rate": 0.0002457899918057468 + }, + { + "step": 226, + "epoch": 1.4125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.702, + "grad_norm": 0.3052440583705902, + "learning_rate": 0.0002451589926245468 + }, + { + "step": 227, + "epoch": 1.41875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6955, + "grad_norm": 0.09104784578084946, + "learning_rate": 0.00024452516268016865 + }, + { + "step": 228, + "epoch": 1.425, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6952, + "grad_norm": 0.1496618390083313, + "learning_rate": 0.00024388852082760884 + }, + { + "step": 229, + "epoch": 1.43125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6983, + "grad_norm": 0.21705561876296997, + "learning_rate": 0.00024324908600551162 + }, + { + "step": 230, + "epoch": 1.4375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7003, + "grad_norm": 0.2856804430484772, + "learning_rate": 0.00024260687723560574 + }, + { + "step": 231, + "epoch": 1.44375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6849, + "grad_norm": 0.2699078321456909, + "learning_rate": 0.00024196191362213862 + }, + { + "step": 232, + "epoch": 1.45, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6911, + "grad_norm": 0.11205316334962845, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 233, + "epoch": 1.45625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6958, + "grad_norm": 0.16415227949619293, + "learning_rate": 0.0002406637986906913 + }, + { + "step": 234, + "epoch": 1.4625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125272064, + "loss": 0.6971, + "grad_norm": 0.15599234402179718, + "learning_rate": 0.00024001068598867212 + }, + { + "step": 235, + "epoch": 1.46875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.695, + "grad_norm": 0.35523083806037903, + "learning_rate": 0.000239354895673865 + }, + { + "step": 236, + "epoch": 1.475, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7001, + "grad_norm": 0.6585632562637329, + "learning_rate": 0.00023869644725453735 + }, + { + "step": 237, + "epoch": 1.48125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.697, + "grad_norm": 0.48015928268432617, + "learning_rate": 0.00023803536031802918 + }, + { + "step": 238, + "epoch": 1.4875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6923, + "grad_norm": 0.8910136818885803, + "learning_rate": 0.00023737165453017033 + }, + { + "step": 239, + "epoch": 1.49375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.7144, + "grad_norm": 0.7677074074745178, + "learning_rate": 0.0002367053496346955 + }, + { + "step": 240, + "epoch": 1.5, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7059, + "grad_norm": 0.507446825504303, + "learning_rate": 0.00023603646545265687 + }, + { + "step": 241, + "epoch": 1.50625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6717, + "grad_norm": 0.3548586368560791, + "learning_rate": 0.00023536502188183472 + }, + { + "step": 242, + "epoch": 1.5125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6809, + "grad_norm": 0.14279431104660034, + "learning_rate": 0.00023469103889614505 + }, + { + "step": 243, + "epoch": 1.51875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.733, + "grad_norm": 1.1021513938903809, + "learning_rate": 0.0002340145365450458 + }, + { + "step": 244, + "epoch": 1.525, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.702, + "grad_norm": 0.3790605068206787, + "learning_rate": 0.0002333355349529403 + }, + { + "step": 245, + "epoch": 1.53125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6937, + "grad_norm": 0.2467116415500641, + "learning_rate": 0.0002326540543185786 + }, + { + "step": 246, + "epoch": 1.5375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7015, + "grad_norm": 0.30124029517173767, + "learning_rate": 0.0002319701149144565 + }, + { + "step": 247, + "epoch": 1.54375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6945, + "grad_norm": 0.11881984025239944, + "learning_rate": 0.00023128373708621275 + }, + { + "step": 248, + "epoch": 1.55, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7128, + "grad_norm": 0.5144086480140686, + "learning_rate": 0.00023059494125202357 + }, + { + "step": 249, + "epoch": 1.55625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6854, + "grad_norm": 0.10162854939699173, + "learning_rate": 0.00022990374790199532 + }, + { + "step": 250, + "epoch": 1.5625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7081, + "grad_norm": 0.5089104175567627, + "learning_rate": 0.0002292101775975552 + }, + { + "step": 251, + "epoch": 1.56875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6974, + "grad_norm": 0.27169546484947205, + "learning_rate": 0.00022851425097083906 + }, + { + "step": 252, + "epoch": 1.575, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7038, + "grad_norm": 0.5937886238098145, + "learning_rate": 0.00022781598872407822 + }, + { + "step": 253, + "epoch": 1.58125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6977, + "grad_norm": 0.21666978299617767, + "learning_rate": 0.00022711541162898321 + }, + { + "step": 254, + "epoch": 1.5875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6986, + "grad_norm": 0.10477087646722794, + "learning_rate": 0.00022641254052612627 + }, + { + "step": 255, + "epoch": 1.59375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6936, + "grad_norm": 0.24831020832061768, + "learning_rate": 0.00022570739632432079 + }, + { + "step": 256, + "epoch": 1.6, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6998, + "grad_norm": 0.2902606427669525, + "learning_rate": 0.000225 + }, + { + "step": 257, + "epoch": 1.60625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.697, + "grad_norm": 0.12245406210422516, + "learning_rate": 0.0002242903725965924 + }, + { + "step": 258, + "epoch": 1.6125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.704, + "grad_norm": 0.4410349428653717, + "learning_rate": 0.00022357853522389615 + }, + { + "step": 259, + "epoch": 1.61875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.689, + "grad_norm": 0.38825294375419617, + "learning_rate": 0.000222864509057451 + }, + { + "step": 260, + "epoch": 1.625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6919, + "grad_norm": 0.1604098379611969, + "learning_rate": 0.00022214831533790813 + }, + { + "step": 261, + "epoch": 1.63125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7101, + "grad_norm": 0.724481999874115, + "learning_rate": 0.0002214299753703987 + }, + { + "step": 262, + "epoch": 1.6375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6922, + "grad_norm": 0.0811934545636177, + "learning_rate": 0.00022070951052389966 + }, + { + "step": 263, + "epoch": 1.64375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125272064, + "loss": 0.693, + "grad_norm": 0.10190754383802414, + "learning_rate": 0.00021998694223059837 + }, + { + "step": 264, + "epoch": 1.65, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6943, + "grad_norm": 0.11035846918821335, + "learning_rate": 0.0002192622919852551 + }, + { + "step": 265, + "epoch": 1.65625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7015, + "grad_norm": 0.4031934440135956, + "learning_rate": 0.00021853558134456307 + }, + { + "step": 266, + "epoch": 1.6625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6894, + "grad_norm": 0.10278279334306717, + "learning_rate": 0.00021780683192650796 + }, + { + "step": 267, + "epoch": 1.66875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6887, + "grad_norm": 0.12756480276584625, + "learning_rate": 0.00021707606540972413 + }, + { + "step": 268, + "epoch": 1.675, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7077, + "grad_norm": 0.6038954854011536, + "learning_rate": 0.00021634330353285017 + }, + { + "step": 269, + "epoch": 1.68125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6909, + "grad_norm": 0.25872159004211426, + "learning_rate": 0.00021560856809388213 + }, + { + "step": 270, + "epoch": 1.6875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6966, + "grad_norm": 0.33204492926597595, + "learning_rate": 0.00021487188094952489 + }, + { + "step": 271, + "epoch": 1.69375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6934, + "grad_norm": 0.09300601482391357, + "learning_rate": 0.0002141332640145423 + }, + { + "step": 272, + "epoch": 1.7, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.7045, + "grad_norm": 0.5162025094032288, + "learning_rate": 0.0002133927392611049 + }, + { + "step": 273, + "epoch": 1.70625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.69, + "grad_norm": 0.12253855168819427, + "learning_rate": 0.00021265032871813658 + }, + { + "step": 274, + "epoch": 1.7125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6996, + "grad_norm": 0.4143090546131134, + "learning_rate": 0.00021190605447065917 + }, + { + "step": 275, + "epoch": 1.71875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.694, + "grad_norm": 0.23697364330291748, + "learning_rate": 0.0002111599386591355 + }, + { + "step": 276, + "epoch": 1.725, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6928, + "grad_norm": 0.1962956339120865, + "learning_rate": 0.00021041200347881057 + }, + { + "step": 277, + "epoch": 1.73125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6968, + "grad_norm": 0.34148111939430237, + "learning_rate": 0.00020966227117905163 + }, + { + "step": 278, + "epoch": 1.7375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6914, + "grad_norm": 0.11305201053619385, + "learning_rate": 0.00020891076406268612 + }, + { + "step": 279, + "epoch": 1.74375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6998, + "grad_norm": 0.3818724453449249, + "learning_rate": 0.00020815750448533805 + }, + { + "step": 280, + "epoch": 1.75, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6975, + "grad_norm": 0.16268140077590942, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 281, + "epoch": 1.75625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6919, + "grad_norm": 0.3051871955394745, + "learning_rate": 0.00020664581763018324 + }, + { + "step": 282, + "epoch": 1.7625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6981, + "grad_norm": 0.17470215260982513, + "learning_rate": 0.00020588743532161543 + }, + { + "step": 283, + "epoch": 1.76875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7147, + "grad_norm": 0.6485084891319275, + "learning_rate": 0.00020512739048920552 + }, + { + "step": 284, + "epoch": 1.775, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7007, + "grad_norm": 0.24745546281337738, + "learning_rate": 0.00020436570574255522 + }, + { + "step": 285, + "epoch": 1.78125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6927, + "grad_norm": 0.1202501654624939, + "learning_rate": 0.00020360240374005 + }, + { + "step": 286, + "epoch": 1.7875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6997, + "grad_norm": 0.5814319252967834, + "learning_rate": 0.00020283750718818501 + }, + { + "step": 287, + "epoch": 1.79375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6968, + "grad_norm": 0.1567773073911667, + "learning_rate": 0.00020207103884088955 + }, + { + "step": 288, + "epoch": 1.8, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6944, + "grad_norm": 0.08155351877212524, + "learning_rate": 0.00020130302149885031 + }, + { + "step": 289, + "epoch": 1.80625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.694, + "grad_norm": 0.21660815179347992, + "learning_rate": 0.00020053347800883298 + }, + { + "step": 290, + "epoch": 1.8125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6907, + "grad_norm": 0.1288982778787613, + "learning_rate": 0.00019976243126300282 + }, + { + "step": 291, + "epoch": 1.81875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.692, + "grad_norm": 0.09217669069766998, + "learning_rate": 0.00019898990419824333 + }, + { + "step": 292, + "epoch": 1.825, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125252096, + "loss": 0.6954, + "grad_norm": 0.11234449595212936, + "learning_rate": 0.00019821591979547423 + }, + { + "step": 293, + "epoch": 1.83125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.7143, + "grad_norm": 0.48493075370788574, + "learning_rate": 0.00019744050107896774 + }, + { + "step": 294, + "epoch": 1.8375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125253632, + "loss": 0.7181, + "grad_norm": 0.6155734658241272, + "learning_rate": 0.0001966636711156636 + }, + { + "step": 295, + "epoch": 1.84375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6922, + "grad_norm": 0.08432667702436447, + "learning_rate": 0.00019588545301448302 + }, + { + "step": 296, + "epoch": 1.85, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6956, + "grad_norm": 0.32032787799835205, + "learning_rate": 0.00019510586992564093 + }, + { + "step": 297, + "epoch": 1.85625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6972, + "grad_norm": 0.09736859798431396, + "learning_rate": 0.0001943249450399578 + }, + { + "step": 298, + "epoch": 1.8625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.698, + "grad_norm": 0.17176523804664612, + "learning_rate": 0.0001935427015881693 + }, + { + "step": 299, + "epoch": 1.86875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6915, + "grad_norm": 0.09317140281200409, + "learning_rate": 0.00019275916284023563 + }, + { + "step": 300, + "epoch": 1.875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7117, + "grad_norm": 0.5486005544662476, + "learning_rate": 0.00019197435210464882 + }, + { + "step": 301, + "epoch": 1.88125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6938, + "grad_norm": 0.06743083894252777, + "learning_rate": 0.00019118829272773985 + }, + { + "step": 302, + "epoch": 1.8875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6936, + "grad_norm": 0.09482726454734802, + "learning_rate": 0.00019040100809298392 + }, + { + "step": 303, + "epoch": 1.89375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125272064, + "loss": 0.6961, + "grad_norm": 0.1684800684452057, + "learning_rate": 0.00018961252162030476 + }, + { + "step": 304, + "epoch": 1.9, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.692, + "grad_norm": 0.1922428011894226, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 305, + "epoch": 1.90625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6921, + "grad_norm": 0.19973766803741455, + "learning_rate": 0.00018803203701893393 + }, + { + "step": 306, + "epoch": 1.9125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6932, + "grad_norm": 0.06828421354293823, + "learning_rate": 0.00018724008590605742 + }, + { + "step": 307, + "epoch": 1.91875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7109, + "grad_norm": 0.46476683020591736, + "learning_rate": 0.0001864470269854896 + }, + { + "step": 308, + "epoch": 1.925, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6895, + "grad_norm": 0.06402019411325455, + "learning_rate": 0.00018565288384892595 + }, + { + "step": 309, + "epoch": 1.93125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7172, + "grad_norm": 0.5795151591300964, + "learning_rate": 0.00018485768012031518 + }, + { + "step": 310, + "epoch": 1.9375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.686, + "grad_norm": 0.1812419295310974, + "learning_rate": 0.00018406143945515598 + }, + { + "step": 311, + "epoch": 1.94375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6915, + "grad_norm": 0.10565205663442612, + "learning_rate": 0.00018326418553979367 + }, + { + "step": 312, + "epoch": 1.95, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6975, + "grad_norm": 0.19832223653793335, + "learning_rate": 0.0001824659420907154 + }, + { + "step": 313, + "epoch": 1.95625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6909, + "grad_norm": 0.1295897215604782, + "learning_rate": 0.00018166673285384475 + }, + { + "step": 314, + "epoch": 1.9625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6924, + "grad_norm": 0.06630301475524902, + "learning_rate": 0.00018086658160383523 + }, + { + "step": 315, + "epoch": 1.96875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6921, + "grad_norm": 0.16033147275447845, + "learning_rate": 0.00018006551214336304 + }, + { + "step": 316, + "epoch": 1.975, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6948, + "grad_norm": 0.3078438639640808, + "learning_rate": 0.00017926354830241924 + }, + { + "step": 317, + "epoch": 1.98125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6927, + "grad_norm": 0.22512038052082062, + "learning_rate": 0.00017846071393760044 + }, + { + "step": 318, + "epoch": 1.9875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6908, + "grad_norm": 0.12931238114833832, + "learning_rate": 0.00017765703293139948 + }, + { + "step": 319, + "epoch": 1.99375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6846, + "grad_norm": 0.20147636532783508, + "learning_rate": 0.00017685252919149493 + }, + { + "step": 320, + "epoch": 2.0, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6793, + "grad_norm": 0.25893735885620117, + "learning_rate": 0.00017604722665003956 + }, + { + "step": 321, + "epoch": 2.00625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6545, + "grad_norm": 0.47142738103866577, + "learning_rate": 0.00017524114926294887 + }, + { + "step": 322, + "epoch": 2.0125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.7009, + "grad_norm": 0.39167261123657227, + "learning_rate": 0.0001744343210091883 + }, + { + "step": 323, + "epoch": 2.01875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.718, + "grad_norm": 0.6048240065574646, + "learning_rate": 0.00017362676589005967 + }, + { + "step": 324, + "epoch": 2.025, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125272064, + "loss": 0.7528, + "grad_norm": 0.9523923397064209, + "learning_rate": 0.0001728185079284875 + }, + { + "step": 325, + "epoch": 2.03125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.7155, + "grad_norm": 0.5011513829231262, + "learning_rate": 0.00017200957116830423 + }, + { + "step": 326, + "epoch": 2.0375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.7068, + "grad_norm": 0.31644800305366516, + "learning_rate": 0.00017119997967353514 + }, + { + "step": 327, + "epoch": 2.04375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6868, + "grad_norm": 0.07485314458608627, + "learning_rate": 0.00017038975752768211 + }, + { + "step": 328, + "epoch": 2.05, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6951, + "grad_norm": 0.16485320031642914, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 329, + "epoch": 2.05625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6948, + "grad_norm": 0.28276580572128296, + "learning_rate": 0.0001687675177098179 + }, + { + "step": 330, + "epoch": 2.0625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6917, + "grad_norm": 0.08523645251989365, + "learning_rate": 0.00016795554829574435 + }, + { + "step": 331, + "epoch": 2.06875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7008, + "grad_norm": 0.23040834069252014, + "learning_rate": 0.00016714304474502696 + }, + { + "step": 332, + "epoch": 2.075, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6814, + "grad_norm": 0.27334246039390564, + "learning_rate": 0.00016633003122779467 + }, + { + "step": 333, + "epoch": 2.08125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6837, + "grad_norm": 0.13934960961341858, + "learning_rate": 0.00016551653192934694 + }, + { + "step": 334, + "epoch": 2.0875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6984, + "grad_norm": 0.3375794589519501, + "learning_rate": 0.0001647025710494341 + }, + { + "step": 335, + "epoch": 2.09375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6989, + "grad_norm": 0.3868290185928345, + "learning_rate": 0.00016388817280153735 + }, + { + "step": 336, + "epoch": 2.1, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6761, + "grad_norm": 0.20098507404327393, + "learning_rate": 0.00016307336141214873 + }, + { + "step": 337, + "epoch": 2.10625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.7137, + "grad_norm": 0.7988783121109009, + "learning_rate": 0.00016225816112005022 + }, + { + "step": 338, + "epoch": 2.1125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7039, + "grad_norm": 0.8396597504615784, + "learning_rate": 0.00016144259617559286 + }, + { + "step": 339, + "epoch": 2.11875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6927, + "grad_norm": 0.12363620847463608, + "learning_rate": 0.00016062669083997513 + }, + { + "step": 340, + "epoch": 2.125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7025, + "grad_norm": 0.6848641633987427, + "learning_rate": 0.00015981046938452146 + }, + { + "step": 341, + "epoch": 2.13125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.729, + "grad_norm": 1.8492637872695923, + "learning_rate": 0.00015899395608996015 + }, + { + "step": 342, + "epoch": 2.1375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7261, + "grad_norm": 1.8087702989578247, + "learning_rate": 0.00015817717524570094 + }, + { + "step": 343, + "epoch": 2.14375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6993, + "grad_norm": 0.5901176929473877, + "learning_rate": 0.0001573601511491127 + }, + { + "step": 344, + "epoch": 2.15, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6992, + "grad_norm": 0.9114717841148376, + "learning_rate": 0.00015654290810480042 + }, + { + "step": 345, + "epoch": 2.15625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6883, + "grad_norm": 0.25217965245246887, + "learning_rate": 0.00015572547042388223 + }, + { + "step": 346, + "epoch": 2.1625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.7055, + "grad_norm": 0.4325340688228607, + "learning_rate": 0.00015490786242326643 + }, + { + "step": 347, + "epoch": 2.16875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.7258, + "grad_norm": 0.686921238899231, + "learning_rate": 0.00015409010842492777 + }, + { + "step": 348, + "epoch": 2.175, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.728, + "grad_norm": 0.63204425573349, + "learning_rate": 0.00015327223275518416 + }, + { + "step": 349, + "epoch": 2.18125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525056, + "loss": 0.6967, + "grad_norm": 0.23319289088249207, + "learning_rate": 0.000152454259743973 + }, + { + "step": 350, + "epoch": 2.1875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6952, + "grad_norm": 0.25094565749168396, + "learning_rate": 0.00015163621372412734 + }, + { + "step": 351, + "epoch": 2.19375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7044, + "grad_norm": 0.4609512686729431, + "learning_rate": 0.00015081811903065205 + }, + { + "step": 352, + "epoch": 2.2, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6948, + "grad_norm": 0.10063501447439194, + "learning_rate": 0.00015 + }, + { + "step": 353, + "epoch": 2.20625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.694, + "grad_norm": 0.22907935082912445, + "learning_rate": 0.0001491818809693479 + }, + { + "step": 354, + "epoch": 2.2125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6664, + "grad_norm": 1.54314124584198, + "learning_rate": 0.00014836378627587266 + }, + { + "step": 355, + "epoch": 2.21875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7138, + "grad_norm": 0.9795949459075928, + "learning_rate": 0.00014754574025602698 + }, + { + "step": 356, + "epoch": 2.225, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.7231, + "grad_norm": 1.1523598432540894, + "learning_rate": 0.00014672776724481584 + }, + { + "step": 357, + "epoch": 2.23125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7297, + "grad_norm": 1.2247941493988037, + "learning_rate": 0.00014590989157507224 + }, + { + "step": 358, + "epoch": 2.2375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7079, + "grad_norm": 0.8389212489128113, + "learning_rate": 0.00014509213757673357 + }, + { + "step": 359, + "epoch": 2.24375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6993, + "grad_norm": 0.321490079164505, + "learning_rate": 0.00014427452957611775 + }, + { + "step": 360, + "epoch": 2.25, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6953, + "grad_norm": 0.3756146728992462, + "learning_rate": 0.0001434570918951996 + }, + { + "step": 361, + "epoch": 2.25625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6998, + "grad_norm": 0.16573995351791382, + "learning_rate": 0.0001426398488508873 + }, + { + "step": 362, + "epoch": 2.2625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6845, + "grad_norm": 0.3822590112686157, + "learning_rate": 0.00014182282475429903 + }, + { + "step": 363, + "epoch": 2.26875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6919, + "grad_norm": 0.12987999618053436, + "learning_rate": 0.00014100604391003985 + }, + { + "step": 364, + "epoch": 2.275, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7101, + "grad_norm": 0.6347695589065552, + "learning_rate": 0.0001401895306154785 + }, + { + "step": 365, + "epoch": 2.28125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7178, + "grad_norm": 0.8246526122093201, + "learning_rate": 0.00013937330916002487 + }, + { + "step": 366, + "epoch": 2.2875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7012, + "grad_norm": 0.3713129758834839, + "learning_rate": 0.00013855740382440714 + }, + { + "step": 367, + "epoch": 2.29375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6871, + "grad_norm": 0.4683196544647217, + "learning_rate": 0.0001377418388799498 + }, + { + "step": 368, + "epoch": 2.3, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6891, + "grad_norm": 0.4636493921279907, + "learning_rate": 0.00013692663858785124 + }, + { + "step": 369, + "epoch": 2.30625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6895, + "grad_norm": 0.5213725566864014, + "learning_rate": 0.00013611182719846268 + }, + { + "step": 370, + "epoch": 2.3125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6919, + "grad_norm": 0.10538755357265472, + "learning_rate": 0.0001352974289505659 + }, + { + "step": 371, + "epoch": 2.31875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6952, + "grad_norm": 0.24602587521076202, + "learning_rate": 0.000134483468070653 + }, + { + "step": 372, + "epoch": 2.325, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6987, + "grad_norm": 0.3601747155189514, + "learning_rate": 0.00013366996877220533 + }, + { + "step": 373, + "epoch": 2.33125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6923, + "grad_norm": 0.10256825387477875, + "learning_rate": 0.000132856955254973 + }, + { + "step": 374, + "epoch": 2.3375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.1252736, + "loss": 0.6935, + "grad_norm": 0.7699581980705261, + "learning_rate": 0.00013204445170425565 + }, + { + "step": 375, + "epoch": 2.34375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.695, + "grad_norm": 0.4469190239906311, + "learning_rate": 0.00013123248229018214 + }, + { + "step": 376, + "epoch": 2.35, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6926, + "grad_norm": 0.5761776566505432, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 377, + "epoch": 2.35625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.1252736, + "loss": 0.6927, + "grad_norm": 0.39169833064079285, + "learning_rate": 0.0001296102424723179 + }, + { + "step": 378, + "epoch": 2.3625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6898, + "grad_norm": 0.42367443442344666, + "learning_rate": 0.0001288000203264649 + }, + { + "step": 379, + "epoch": 2.36875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7071, + "grad_norm": 0.9209778308868408, + "learning_rate": 0.00012799042883169574 + }, + { + "step": 380, + "epoch": 2.375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6829, + "grad_norm": 0.5709733963012695, + "learning_rate": 0.00012718149207151247 + }, + { + "step": 381, + "epoch": 2.38125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7036, + "grad_norm": 0.8025131821632385, + "learning_rate": 0.00012637323410994033 + }, + { + "step": 382, + "epoch": 2.3875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6829, + "grad_norm": 0.23625433444976807, + "learning_rate": 0.0001255656789908117 + }, + { + "step": 383, + "epoch": 2.39375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6902, + "grad_norm": 0.10295777767896652, + "learning_rate": 0.0001247588507370511 + }, + { + "step": 384, + "epoch": 2.4, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7109, + "grad_norm": 1.0728468894958496, + "learning_rate": 0.00012395277334996044 + }, + { + "step": 385, + "epoch": 2.40625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.716, + "grad_norm": 1.3867613077163696, + "learning_rate": 0.0001231474708085051 + }, + { + "step": 386, + "epoch": 2.4125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6977, + "grad_norm": 0.4172632098197937, + "learning_rate": 0.0001223429670686005 + }, + { + "step": 387, + "epoch": 2.41875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.693, + "grad_norm": 0.5568313002586365, + "learning_rate": 0.00012153928606239957 + }, + { + "step": 388, + "epoch": 2.425, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6923, + "grad_norm": 0.15788021683692932, + "learning_rate": 0.00012073645169758076 + }, + { + "step": 389, + "epoch": 2.43125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6959, + "grad_norm": 0.3490711748600006, + "learning_rate": 0.00011993448785663692 + }, + { + "step": 390, + "epoch": 2.4375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6887, + "grad_norm": 0.19922082126140594, + "learning_rate": 0.00011913341839616476 + }, + { + "step": 391, + "epoch": 2.44375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6937, + "grad_norm": 0.2074379026889801, + "learning_rate": 0.00011833326714615522 + }, + { + "step": 392, + "epoch": 2.45, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.698, + "grad_norm": 0.3303549885749817, + "learning_rate": 0.00011753405790928456 + }, + { + "step": 393, + "epoch": 2.45625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6907, + "grad_norm": 0.07187434285879135, + "learning_rate": 0.0001167358144602063 + }, + { + "step": 394, + "epoch": 2.4625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.699, + "grad_norm": 0.3802112340927124, + "learning_rate": 0.00011593856054484402 + }, + { + "step": 395, + "epoch": 2.46875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6674, + "grad_norm": 0.7081384062767029, + "learning_rate": 0.00011514231987968482 + }, + { + "step": 396, + "epoch": 2.475, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6939, + "grad_norm": 0.11892477422952652, + "learning_rate": 0.00011434711615107404 + }, + { + "step": 397, + "epoch": 2.48125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6917, + "grad_norm": 0.12920916080474854, + "learning_rate": 0.00011355297301451042 + }, + { + "step": 398, + "epoch": 2.4875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6969, + "grad_norm": 0.14373473823070526, + "learning_rate": 0.00011275991409394253 + }, + { + "step": 399, + "epoch": 2.49375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7086, + "grad_norm": 0.5513309836387634, + "learning_rate": 0.00011196796298106608 + }, + { + "step": 400, + "epoch": 2.5, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7057, + "grad_norm": 0.515099287033081, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 401, + "epoch": 2.50625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6839, + "grad_norm": 0.5127894282341003, + "learning_rate": 0.00011038747837969526 + }, + { + "step": 402, + "epoch": 2.5125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6916, + "grad_norm": 0.07560497522354126, + "learning_rate": 0.00010959899190701608 + }, + { + "step": 403, + "epoch": 2.51875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6932, + "grad_norm": 0.14066872000694275, + "learning_rate": 0.00010881170727226018 + }, + { + "step": 404, + "epoch": 2.525, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6929, + "grad_norm": 0.32408609986305237, + "learning_rate": 0.00010802564789535119 + }, + { + "step": 405, + "epoch": 2.53125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6928, + "grad_norm": 0.3146069049835205, + "learning_rate": 0.00010724083715976441 + }, + { + "step": 406, + "epoch": 2.5375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6912, + "grad_norm": 0.33855053782463074, + "learning_rate": 0.00010645729841183066 + }, + { + "step": 407, + "epoch": 2.54375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6941, + "grad_norm": 0.21648286283016205, + "learning_rate": 0.00010567505496004213 + }, + { + "step": 408, + "epoch": 2.55, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6927, + "grad_norm": 0.21529866755008698, + "learning_rate": 0.00010489413007435904 + }, + { + "step": 409, + "epoch": 2.55625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6918, + "grad_norm": 0.1439589112997055, + "learning_rate": 0.00010411454698551695 + }, + { + "step": 410, + "epoch": 2.5625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.691, + "grad_norm": 0.3738196790218353, + "learning_rate": 0.00010333632888433638 + }, + { + "step": 411, + "epoch": 2.56875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.7015, + "grad_norm": 0.47344323992729187, + "learning_rate": 0.00010255949892103225 + }, + { + "step": 412, + "epoch": 2.575, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6887, + "grad_norm": 0.12582124769687653, + "learning_rate": 0.00010178408020452579 + }, + { + "step": 413, + "epoch": 2.58125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6881, + "grad_norm": 0.1123841181397438, + "learning_rate": 0.00010101009580175669 + }, + { + "step": 414, + "epoch": 2.5875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6969, + "grad_norm": 0.19353842735290527, + "learning_rate": 0.00010023756873699722 + }, + { + "step": 415, + "epoch": 2.59375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6965, + "grad_norm": 0.285798043012619, + "learning_rate": 9.946652199116699e-05 + }, + { + "step": 416, + "epoch": 2.6, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.7054, + "grad_norm": 0.5005611777305603, + "learning_rate": 9.869697850114969e-05 + }, + { + "step": 417, + "epoch": 2.60625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7039, + "grad_norm": 0.4666767418384552, + "learning_rate": 9.792896115911045e-05 + }, + { + "step": 418, + "epoch": 2.6125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6896, + "grad_norm": 0.18099334836006165, + "learning_rate": 9.716249281181497e-05 + }, + { + "step": 419, + "epoch": 2.61875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6961, + "grad_norm": 0.5334787368774414, + "learning_rate": 9.639759625994998e-05 + }, + { + "step": 420, + "epoch": 2.625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6911, + "grad_norm": 0.09216499328613281, + "learning_rate": 9.563429425744476e-05 + }, + { + "step": 421, + "epoch": 2.63125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7011, + "grad_norm": 0.43369564414024353, + "learning_rate": 9.487260951079448e-05 + }, + { + "step": 422, + "epoch": 2.6375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7025, + "grad_norm": 0.6535469889640808, + "learning_rate": 9.411256467838455e-05 + }, + { + "step": 423, + "epoch": 2.64375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6822, + "grad_norm": 0.5698631405830383, + "learning_rate": 9.335418236981677e-05 + }, + { + "step": 424, + "epoch": 2.65, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7019, + "grad_norm": 0.3116818070411682, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 425, + "epoch": 2.65625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6993, + "grad_norm": 0.3273603916168213, + "learning_rate": 9.184249551466189e-05 + }, + { + "step": 426, + "epoch": 2.6625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6874, + "grad_norm": 0.27513882517814636, + "learning_rate": 9.10892359373139e-05 + }, + { + "step": 427, + "epoch": 2.66875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.695, + "grad_norm": 0.1304827332496643, + "learning_rate": 9.033772882094833e-05 + }, + { + "step": 428, + "epoch": 2.675, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.7082, + "grad_norm": 0.720856249332428, + "learning_rate": 8.958799652118943e-05 + }, + { + "step": 429, + "epoch": 2.68125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.696, + "grad_norm": 0.1487133651971817, + "learning_rate": 8.884006134086449e-05 + }, + { + "step": 430, + "epoch": 2.6875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6908, + "grad_norm": 0.11521019786596298, + "learning_rate": 8.809394552934079e-05 + }, + { + "step": 431, + "epoch": 2.69375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6887, + "grad_norm": 0.445828378200531, + "learning_rate": 8.734967128186338e-05 + }, + { + "step": 432, + "epoch": 2.7, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6937, + "grad_norm": 0.07160533219575882, + "learning_rate": 8.660726073889511e-05 + }, + { + "step": 433, + "epoch": 2.70625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6937, + "grad_norm": 0.12393249571323395, + "learning_rate": 8.586673598545771e-05 + }, + { + "step": 434, + "epoch": 2.7125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6947, + "grad_norm": 0.11748331785202026, + "learning_rate": 8.512811905047505e-05 + }, + { + "step": 435, + "epoch": 2.71875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6954, + "grad_norm": 0.23126472532749176, + "learning_rate": 8.439143190611787e-05 + }, + { + "step": 436, + "epoch": 2.725, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6921, + "grad_norm": 0.11005670577287674, + "learning_rate": 8.365669646714983e-05 + }, + { + "step": 437, + "epoch": 2.73125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6971, + "grad_norm": 0.10512620955705643, + "learning_rate": 8.29239345902759e-05 + }, + { + "step": 438, + "epoch": 2.7375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6949, + "grad_norm": 0.0608757883310318, + "learning_rate": 8.219316807349204e-05 + }, + { + "step": 439, + "epoch": 2.74375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6913, + "grad_norm": 0.06250602751970291, + "learning_rate": 8.146441865543689e-05 + }, + { + "step": 440, + "epoch": 2.75, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.696, + "grad_norm": 0.7498326301574707, + "learning_rate": 8.073770801474495e-05 + }, + { + "step": 441, + "epoch": 2.75625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6933, + "grad_norm": 0.08074386417865753, + "learning_rate": 8.001305776940163e-05 + }, + { + "step": 442, + "epoch": 2.7625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6919, + "grad_norm": 0.17472496628761292, + "learning_rate": 7.929048947610034e-05 + }, + { + "step": 443, + "epoch": 2.76875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6934, + "grad_norm": 0.19197845458984375, + "learning_rate": 7.857002462960132e-05 + }, + { + "step": 444, + "epoch": 2.775, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6879, + "grad_norm": 0.20592744648456573, + "learning_rate": 7.785168466209187e-05 + }, + { + "step": 445, + "epoch": 2.78125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6886, + "grad_norm": 0.1047009527683258, + "learning_rate": 7.713549094254897e-05 + }, + { + "step": 446, + "epoch": 2.7875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6838, + "grad_norm": 0.22834080457687378, + "learning_rate": 7.64214647761038e-05 + }, + { + "step": 447, + "epoch": 2.79375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.686, + "grad_norm": 0.1029311865568161, + "learning_rate": 7.570962740340759e-05 + }, + { + "step": 448, + "epoch": 2.8, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6873, + "grad_norm": 0.0684528797864914, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 449, + "epoch": 2.80625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6858, + "grad_norm": 0.058492451906204224, + "learning_rate": 7.429260367567916e-05 + }, + { + "step": 450, + "epoch": 2.8125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7119, + "grad_norm": 0.6199074983596802, + "learning_rate": 7.358745947387373e-05 + }, + { + "step": 451, + "epoch": 2.81875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6972, + "grad_norm": 0.3497580289840698, + "learning_rate": 7.288458837101675e-05 + }, + { + "step": 452, + "epoch": 2.825, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6622, + "grad_norm": 0.44165685772895813, + "learning_rate": 7.218401127592175e-05 + }, + { + "step": 453, + "epoch": 2.83125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6718, + "grad_norm": 0.2440240979194641, + "learning_rate": 7.14857490291609e-05 + }, + { + "step": 454, + "epoch": 2.8375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6954, + "grad_norm": 0.2545589208602905, + "learning_rate": 7.07898224024448e-05 + }, + { + "step": 455, + "epoch": 2.84375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.718, + "grad_norm": 0.7534648180007935, + "learning_rate": 7.009625209800465e-05 + }, + { + "step": 456, + "epoch": 2.85, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.7116, + "grad_norm": 0.5986682176589966, + "learning_rate": 6.940505874797639e-05 + }, + { + "step": 457, + "epoch": 2.85625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7056, + "grad_norm": 0.43100282549858093, + "learning_rate": 6.871626291378728e-05 + }, + { + "step": 458, + "epoch": 2.8625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7138, + "grad_norm": 0.678110659122467, + "learning_rate": 6.80298850855435e-05 + }, + { + "step": 459, + "epoch": 2.86875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6986, + "grad_norm": 0.17331917583942413, + "learning_rate": 6.734594568142142e-05 + }, + { + "step": 460, + "epoch": 2.875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6986, + "grad_norm": 0.2997139096260071, + "learning_rate": 6.66644650470597e-05 + }, + { + "step": 461, + "epoch": 2.88125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6942, + "grad_norm": 0.6641805768013, + "learning_rate": 6.598546345495417e-05 + }, + { + "step": 462, + "epoch": 2.8875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6889, + "grad_norm": 0.4220745265483856, + "learning_rate": 6.530896110385494e-05 + }, + { + "step": 463, + "epoch": 2.89375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6966, + "grad_norm": 0.0717463567852974, + "learning_rate": 6.463497811816523e-05 + }, + { + "step": 464, + "epoch": 2.9, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6969, + "grad_norm": 0.21131061017513275, + "learning_rate": 6.396353454734311e-05 + }, + { + "step": 465, + "epoch": 2.90625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6847, + "grad_norm": 0.2352937012910843, + "learning_rate": 6.32946503653045e-05 + }, + { + "step": 466, + "epoch": 2.9125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6968, + "grad_norm": 0.14926855266094208, + "learning_rate": 6.262834546982969e-05 + }, + { + "step": 467, + "epoch": 2.91875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.692, + "grad_norm": 0.10684549063444138, + "learning_rate": 6.196463968197084e-05 + }, + { + "step": 468, + "epoch": 2.925, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6959, + "grad_norm": 0.16404668986797333, + "learning_rate": 6.130355274546267e-05 + }, + { + "step": 469, + "epoch": 2.93125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7107, + "grad_norm": 0.4303646385669708, + "learning_rate": 6.064510432613499e-05 + }, + { + "step": 470, + "epoch": 2.9375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7027, + "grad_norm": 0.2557452321052551, + "learning_rate": 5.998931401132786e-05 + }, + { + "step": 471, + "epoch": 2.94375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6798, + "grad_norm": 0.2410067319869995, + "learning_rate": 5.933620130930867e-05 + }, + { + "step": 472, + "epoch": 2.95, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.7033, + "grad_norm": 0.2912239134311676, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 473, + "epoch": 2.95625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.696, + "grad_norm": 0.14192511141300201, + "learning_rate": 5.803808637786135e-05 + }, + { + "step": 474, + "epoch": 2.9625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6989, + "grad_norm": 0.1904364973306656, + "learning_rate": 5.739312276439427e-05 + }, + { + "step": 475, + "epoch": 2.96875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6979, + "grad_norm": 0.11238906532526016, + "learning_rate": 5.6750913994488415e-05 + }, + { + "step": 476, + "epoch": 2.975, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6972, + "grad_norm": 0.1016833558678627, + "learning_rate": 5.6111479172391136e-05 + }, + { + "step": 477, + "epoch": 2.98125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.691, + "grad_norm": 0.11427196860313416, + "learning_rate": 5.5474837319831314e-05 + }, + { + "step": 478, + "epoch": 2.9875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6907, + "grad_norm": 0.13492944836616516, + "learning_rate": 5.4841007375453186e-05 + }, + { + "step": 479, + "epoch": 2.99375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.691, + "grad_norm": 0.1503468155860901, + "learning_rate": 5.4210008194253196e-05 + }, + { + "step": 480, + "epoch": 3.0, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6911, + "grad_norm": 0.15045872330665588, + "learning_rate": 5.358185854701909e-05 + }, + { + "step": 481, + "epoch": 3.00625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6873, + "grad_norm": 0.16269592940807343, + "learning_rate": 5.2956577119771405e-05 + }, + { + "step": 482, + "epoch": 3.0125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6935, + "grad_norm": 0.19804009795188904, + "learning_rate": 5.233418251320765e-05 + }, + { + "step": 483, + "epoch": 3.01875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6994, + "grad_norm": 0.2612445652484894, + "learning_rate": 5.171469324214901e-05 + }, + { + "step": 484, + "epoch": 3.025, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6927, + "grad_norm": 0.05862182378768921, + "learning_rate": 5.109812773498967e-05 + }, + { + "step": 485, + "epoch": 3.03125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6964, + "grad_norm": 0.23759996891021729, + "learning_rate": 5.048450433314835e-05 + }, + { + "step": 486, + "epoch": 3.0375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6897, + "grad_norm": 0.21181254088878632, + "learning_rate": 4.987384129052291e-05 + }, + { + "step": 487, + "epoch": 3.04375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6955, + "grad_norm": 0.12667913734912872, + "learning_rate": 4.926615677294723e-05 + }, + { + "step": 488, + "epoch": 3.05, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6931, + "grad_norm": 0.31823867559432983, + "learning_rate": 4.866146885765096e-05 + }, + { + "step": 489, + "epoch": 3.05625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6955, + "grad_norm": 0.35170838236808777, + "learning_rate": 4.8059795532721575e-05 + }, + { + "step": 490, + "epoch": 3.0625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6928, + "grad_norm": 0.15956376492977142, + "learning_rate": 4.7461154696569294e-05 + }, + { + "step": 491, + "epoch": 3.06875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6943, + "grad_norm": 0.30872446298599243, + "learning_rate": 4.686556415739488e-05 + }, + { + "step": 492, + "epoch": 3.075, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6935, + "grad_norm": 0.15700098872184753, + "learning_rate": 4.62730416326596e-05 + }, + { + "step": 493, + "epoch": 3.08125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6977, + "grad_norm": 0.41455429792404175, + "learning_rate": 4.568360474855826e-05 + }, + { + "step": 494, + "epoch": 3.0875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6913, + "grad_norm": 0.10853341966867447, + "learning_rate": 4.509727103949492e-05 + }, + { + "step": 495, + "epoch": 3.09375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6926, + "grad_norm": 0.4400142431259155, + "learning_rate": 4.451405794756138e-05 + }, + { + "step": 496, + "epoch": 3.1, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6933, + "grad_norm": 0.16079115867614746, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 497, + "epoch": 3.10625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6941, + "grad_norm": 0.29592418670654297, + "learning_rate": 4.33570629187776e-05 + }, + { + "step": 498, + "epoch": 3.1125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6975, + "grad_norm": 0.4071213901042938, + "learning_rate": 4.278331539989307e-05 + }, + { + "step": 499, + "epoch": 3.11875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.694, + "grad_norm": 0.1004914939403534, + "learning_rate": 4.2212757333045283e-05 + }, + { + "step": 500, + "epoch": 3.125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6957, + "grad_norm": 0.16775313019752502, + "learning_rate": 4.164540569103667e-05 + }, + { + "step": 501, + "epoch": 3.13125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6932, + "grad_norm": 0.060192953795194626, + "learning_rate": 4.108127735128561e-05 + }, + { + "step": 502, + "epoch": 3.1375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6909, + "grad_norm": 0.19140060245990753, + "learning_rate": 4.052038909532469e-05 + }, + { + "step": 503, + "epoch": 3.14375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6894, + "grad_norm": 0.24252495169639587, + "learning_rate": 3.996275760830125e-05 + }, + { + "step": 504, + "epoch": 3.15, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6937, + "grad_norm": 0.05318218469619751, + "learning_rate": 3.94083994784814e-05 + }, + { + "step": 505, + "epoch": 3.15625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6977, + "grad_norm": 0.3046831488609314, + "learning_rate": 3.885733119675616e-05 + }, + { + "step": 506, + "epoch": 3.1625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6924, + "grad_norm": 0.07094284147024155, + "learning_rate": 3.830956915615106e-05 + }, + { + "step": 507, + "epoch": 3.16875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6942, + "grad_norm": 0.06773317605257034, + "learning_rate": 3.776512965133863e-05 + }, + { + "step": 508, + "epoch": 3.175, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6888, + "grad_norm": 0.23378834128379822, + "learning_rate": 3.72240288781534e-05 + }, + { + "step": 509, + "epoch": 3.18125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6938, + "grad_norm": 0.25344982743263245, + "learning_rate": 3.66862829331103e-05 + }, + { + "step": 510, + "epoch": 3.1875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6946, + "grad_norm": 0.12745171785354614, + "learning_rate": 3.6151907812925717e-05 + }, + { + "step": 511, + "epoch": 3.19375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6929, + "grad_norm": 0.07218015193939209, + "learning_rate": 3.562091941404179e-05 + }, + { + "step": 512, + "epoch": 3.2, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6927, + "grad_norm": 0.06551176309585571, + "learning_rate": 3.509333353215331e-05 + }, + { + "step": 513, + "epoch": 3.20625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6919, + "grad_norm": 0.15264345705509186, + "learning_rate": 3.456916586173797e-05 + }, + { + "step": 514, + "epoch": 3.2125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6905, + "grad_norm": 0.2416311353445053, + "learning_rate": 3.404843199558945e-05 + }, + { + "step": 515, + "epoch": 3.21875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6954, + "grad_norm": 0.24841423332691193, + "learning_rate": 3.3531147424353664e-05 + }, + { + "step": 516, + "epoch": 3.225, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6949, + "grad_norm": 0.24199911952018738, + "learning_rate": 3.301732753606776e-05 + }, + { + "step": 517, + "epoch": 3.23125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6905, + "grad_norm": 0.4696517884731293, + "learning_rate": 3.250698761570244e-05 + }, + { + "step": 518, + "epoch": 3.2375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6891, + "grad_norm": 0.5710655450820923, + "learning_rate": 3.200014284470745e-05 + }, + { + "step": 519, + "epoch": 3.24375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6926, + "grad_norm": 0.058380063623189926, + "learning_rate": 3.149680830055967e-05 + }, + { + "step": 520, + "epoch": 3.25, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6909, + "grad_norm": 0.23663786053657532, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 521, + "epoch": 3.25625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6905, + "grad_norm": 0.07632457464933395, + "learning_rate": 3.0500729680161663e-05 + }, + { + "step": 522, + "epoch": 3.2625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6876, + "grad_norm": 0.1750950813293457, + "learning_rate": 3.0008015234980552e-05 + }, + { + "step": 523, + "epoch": 3.26875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6927, + "grad_norm": 0.08061862736940384, + "learning_rate": 2.9518870277903274e-05 + }, + { + "step": 524, + "epoch": 3.275, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.691, + "grad_norm": 0.08497494459152222, + "learning_rate": 2.9033309359877597e-05 + }, + { + "step": 525, + "epoch": 3.28125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.7012, + "grad_norm": 0.3552340567111969, + "learning_rate": 2.855134692523438e-05 + }, + { + "step": 526, + "epoch": 3.2875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.692, + "grad_norm": 0.10007113963365555, + "learning_rate": 2.807299731125773e-05 + }, + { + "step": 527, + "epoch": 3.29375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6925, + "grad_norm": 0.10130195319652557, + "learning_rate": 2.759827474775852e-05 + }, + { + "step": 528, + "epoch": 3.3, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6896, + "grad_norm": 0.10907964408397675, + "learning_rate": 2.7127193356651213e-05 + }, + { + "step": 529, + "epoch": 3.30625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.7001, + "grad_norm": 0.3444664776325226, + "learning_rate": 2.665976715153377e-05 + }, + { + "step": 530, + "epoch": 3.3125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6833, + "grad_norm": 0.24995650351047516, + "learning_rate": 2.619601003727043e-05 + }, + { + "step": 531, + "epoch": 3.31875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125275136, + "loss": 0.7015, + "grad_norm": 0.3986763060092926, + "learning_rate": 2.5735935809578656e-05 + }, + { + "step": 532, + "epoch": 3.325, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.702, + "grad_norm": 0.39164891839027405, + "learning_rate": 2.5279558154618197e-05 + }, + { + "step": 533, + "epoch": 3.33125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6936, + "grad_norm": 0.17026498913764954, + "learning_rate": 2.4826890648584353e-05 + }, + { + "step": 534, + "epoch": 3.3375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6953, + "grad_norm": 0.12437622994184494, + "learning_rate": 2.4377946757303828e-05 + }, + { + "step": 535, + "epoch": 3.34375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6988, + "grad_norm": 0.11980962753295898, + "learning_rate": 2.393273983583427e-05 + }, + { + "step": 536, + "epoch": 3.35, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6936, + "grad_norm": 0.15400809049606323, + "learning_rate": 2.3491283128067174e-05 + }, + { + "step": 537, + "epoch": 3.35625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6906, + "grad_norm": 0.16444404423236847, + "learning_rate": 2.3053589766333414e-05 + }, + { + "step": 538, + "epoch": 3.3625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6928, + "grad_norm": 0.1859978884458542, + "learning_rate": 2.261967277101318e-05 + }, + { + "step": 539, + "epoch": 3.36875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6917, + "grad_norm": 0.08429887145757675, + "learning_rate": 2.218954505014821e-05 + }, + { + "step": 540, + "epoch": 3.375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6923, + "grad_norm": 0.3759167194366455, + "learning_rate": 2.1763219399058042e-05 + }, + { + "step": 541, + "epoch": 3.38125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6894, + "grad_norm": 0.07322327047586441, + "learning_rate": 2.1340708499959197e-05 + }, + { + "step": 542, + "epoch": 3.3875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.692, + "grad_norm": 0.16967077553272247, + "learning_rate": 2.0922024921588167e-05 + }, + { + "step": 543, + "epoch": 3.39375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6936, + "grad_norm": 0.15847735106945038, + "learning_rate": 2.0507181118827254e-05 + }, + { + "step": 544, + "epoch": 3.4, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6887, + "grad_norm": 0.07129736989736557, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 545, + "epoch": 3.40625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6961, + "grad_norm": 0.09234222024679184, + "learning_rate": 1.9689062088175154e-05 + }, + { + "step": 546, + "epoch": 3.4125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6912, + "grad_norm": 0.06566044688224792, + "learning_rate": 1.928581119746081e-05 + }, + { + "step": 547, + "epoch": 3.41875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.693, + "grad_norm": 0.19692517817020416, + "learning_rate": 1.8886448755986193e-05 + }, + { + "step": 548, + "epoch": 3.425, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6938, + "grad_norm": 0.09652529656887054, + "learning_rate": 1.8490986643873845e-05 + }, + { + "step": 549, + "epoch": 3.43125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6924, + "grad_norm": 0.09869689494371414, + "learning_rate": 1.8099436625220443e-05 + }, + { + "step": 550, + "epoch": 3.4375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125272064, + "loss": 0.6878, + "grad_norm": 0.13488256931304932, + "learning_rate": 1.7711810347746757e-05 + }, + { + "step": 551, + "epoch": 3.44375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.696, + "grad_norm": 0.21530310809612274, + "learning_rate": 1.7328119342451165e-05 + }, + { + "step": 552, + "epoch": 3.45, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6921, + "grad_norm": 0.08769610524177551, + "learning_rate": 1.694837502326674e-05 + }, + { + "step": 553, + "epoch": 3.45625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6964, + "grad_norm": 0.16983729600906372, + "learning_rate": 1.6572588686721606e-05 + }, + { + "step": 554, + "epoch": 3.4625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.686, + "grad_norm": 0.2858846187591553, + "learning_rate": 1.6200771511602882e-05 + }, + { + "step": 555, + "epoch": 3.46875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6852, + "grad_norm": 0.5962482690811157, + "learning_rate": 1.583293455862422e-05 + }, + { + "step": 556, + "epoch": 3.475, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.689, + "grad_norm": 0.08101170510053635, + "learning_rate": 1.546908877009676e-05 + }, + { + "step": 557, + "epoch": 3.48125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6986, + "grad_norm": 0.2933458387851715, + "learning_rate": 1.5109244969603546e-05 + }, + { + "step": 558, + "epoch": 3.4875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6995, + "grad_norm": 0.23836594820022583, + "learning_rate": 1.4753413861677604e-05 + }, + { + "step": 559, + "epoch": 3.49375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6887, + "grad_norm": 0.16455255448818207, + "learning_rate": 1.4401606031483497e-05 + }, + { + "step": 560, + "epoch": 3.5, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6944, + "grad_norm": 0.19197876751422882, + "learning_rate": 1.4053831944502508e-05 + }, + { + "step": 561, + "epoch": 3.50625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6923, + "grad_norm": 0.0698540136218071, + "learning_rate": 1.371010194622117e-05 + }, + { + "step": 562, + "epoch": 3.5125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7032, + "grad_norm": 0.40879228711128235, + "learning_rate": 1.3370426261823613e-05 + }, + { + "step": 563, + "epoch": 3.51875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6956, + "grad_norm": 0.1469973921775818, + "learning_rate": 1.3034814995887433e-05 + }, + { + "step": 564, + "epoch": 3.525, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6888, + "grad_norm": 0.11112227290868759, + "learning_rate": 1.2703278132082934e-05 + }, + { + "step": 565, + "epoch": 3.53125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6916, + "grad_norm": 0.07269681245088577, + "learning_rate": 1.237582553287631e-05 + }, + { + "step": 566, + "epoch": 3.5375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7038, + "grad_norm": 0.46191561222076416, + "learning_rate": 1.205246693923616e-05 + }, + { + "step": 567, + "epoch": 3.54375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.7001, + "grad_norm": 0.2882208228111267, + "learning_rate": 1.173321197034382e-05 + }, + { + "step": 568, + "epoch": 3.55, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.689, + "grad_norm": 0.1793852597475052, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 569, + "epoch": 3.55625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.7002, + "grad_norm": 0.3280603289604187, + "learning_rate": 1.1107050772877507e-05 + }, + { + "step": 570, + "epoch": 3.5625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6909, + "grad_norm": 0.08574327826499939, + "learning_rate": 1.0800163171172332e-05 + }, + { + "step": 571, + "epoch": 3.56875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6942, + "grad_norm": 0.12177206575870514, + "learning_rate": 1.0497416447398187e-05 + }, + { + "step": 572, + "epoch": 3.575, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6858, + "grad_norm": 0.2826426029205322, + "learning_rate": 1.0198819607580233e-05 + }, + { + "step": 573, + "epoch": 3.58125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6944, + "grad_norm": 0.15841981768608093, + "learning_rate": 9.904381534293993e-06 + }, + { + "step": 574, + "epoch": 3.5875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6898, + "grad_norm": 0.1922999918460846, + "learning_rate": 9.614110986401169e-06 + }, + { + "step": 575, + "epoch": 3.59375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.692, + "grad_norm": 0.0639992356300354, + "learning_rate": 9.32801659878905e-06 + }, + { + "step": 576, + "epoch": 3.6, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.698, + "grad_norm": 0.21107128262519836, + "learning_rate": 9.046106882113751e-06 + }, + { + "step": 577, + "epoch": 3.60625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6925, + "grad_norm": 0.07920857518911362, + "learning_rate": 8.768390222546895e-06 + }, + { + "step": 578, + "epoch": 3.6125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6929, + "grad_norm": 0.09764102101325989, + "learning_rate": 8.494874881526215e-06 + }, + { + "step": 579, + "epoch": 3.61875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6943, + "grad_norm": 0.06641624867916107, + "learning_rate": 8.225568995509834e-06 + }, + { + "step": 580, + "epoch": 3.625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6884, + "grad_norm": 0.3399096429347992, + "learning_rate": 7.960480575734162e-06 + }, + { + "step": 581, + "epoch": 3.63125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.691, + "grad_norm": 0.1418691724538803, + "learning_rate": 7.699617507975563e-06 + }, + { + "step": 582, + "epoch": 3.6375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6915, + "grad_norm": 0.06821310520172119, + "learning_rate": 7.442987552315833e-06 + }, + { + "step": 583, + "epoch": 3.64375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6946, + "grad_norm": 0.07989329099655151, + "learning_rate": 7.190598342911358e-06 + }, + { + "step": 584, + "epoch": 3.65, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6898, + "grad_norm": 0.07532931119203568, + "learning_rate": 6.942457387765976e-06 + }, + { + "step": 585, + "epoch": 3.65625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6893, + "grad_norm": 0.13732770085334778, + "learning_rate": 6.698572068507596e-06 + }, + { + "step": 586, + "epoch": 3.6625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6986, + "grad_norm": 0.3618239164352417, + "learning_rate": 6.458949640168675e-06 + }, + { + "step": 587, + "epoch": 3.66875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125270528, + "loss": 0.6905, + "grad_norm": 0.102206751704216, + "learning_rate": 6.223597230970428e-06 + }, + { + "step": 588, + "epoch": 3.675, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6872, + "grad_norm": 0.28788381814956665, + "learning_rate": 5.992521842110709e-06 + }, + { + "step": 589, + "epoch": 3.68125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6923, + "grad_norm": 0.21753068268299103, + "learning_rate": 5.7657303475556974e-06 + }, + { + "step": 590, + "epoch": 3.6875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6969, + "grad_norm": 0.3121455907821655, + "learning_rate": 5.543229493835594e-06 + }, + { + "step": 591, + "epoch": 3.69375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.698, + "grad_norm": 0.4683733880519867, + "learning_rate": 5.325025899843732e-06 + }, + { + "step": 592, + "epoch": 3.7, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6949, + "grad_norm": 0.26093462109565735, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 593, + "epoch": 3.70625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.692, + "grad_norm": 0.2050081193447113, + "learning_rate": 4.901536327256589e-06 + }, + { + "step": 594, + "epoch": 3.7125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6983, + "grad_norm": 0.4628903865814209, + "learning_rate": 4.6962629465110365e-06 + }, + { + "step": 595, + "epoch": 3.71875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125268992, + "loss": 0.6911, + "grad_norm": 0.07045736908912659, + "learning_rate": 4.495312020818403e-06 + }, + { + "step": 596, + "epoch": 3.725, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6909, + "grad_norm": 0.24409109354019165, + "learning_rate": 4.298689528010785e-06 + }, + { + "step": 597, + "epoch": 3.73125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.693, + "grad_norm": 0.11331121623516083, + "learning_rate": 4.106401317159275e-06 + }, + { + "step": 598, + "epoch": 3.7375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6942, + "grad_norm": 0.2646442651748657, + "learning_rate": 3.918453108399955e-06 + }, + { + "step": 599, + "epoch": 3.74375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6932, + "grad_norm": 0.1485799103975296, + "learning_rate": 3.7348504927637302e-06 + }, + { + "step": 600, + "epoch": 3.75, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6932, + "grad_norm": 0.15669243037700653, + "learning_rate": 3.5555989320099952e-06 + }, + { + "step": 601, + "epoch": 3.75625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6918, + "grad_norm": 0.09167775511741638, + "learning_rate": 3.3807037584642316e-06 + }, + { + "step": 602, + "epoch": 3.7625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.691, + "grad_norm": 0.1394340842962265, + "learning_rate": 3.21017017485925e-06 + }, + { + "step": 603, + "epoch": 3.76875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125267456, + "loss": 0.6937, + "grad_norm": 0.23971301317214966, + "learning_rate": 3.0440032541805825e-06 + }, + { + "step": 604, + "epoch": 3.775, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6931, + "grad_norm": 0.10911689698696136, + "learning_rate": 2.882207939515435e-06 + }, + { + "step": 605, + "epoch": 3.78125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125256704, + "loss": 0.6935, + "grad_norm": 0.3896804749965668, + "learning_rate": 2.7247890439057064e-06 + }, + { + "step": 606, + "epoch": 3.7875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6935, + "grad_norm": 0.1936010718345642, + "learning_rate": 2.5717512502048342e-06 + }, + { + "step": 607, + "epoch": 3.79375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6922, + "grad_norm": 0.16034403443336487, + "learning_rate": 2.423099110938376e-06 + }, + { + "step": 608, + "epoch": 3.8, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.691, + "grad_norm": 0.21620425581932068, + "learning_rate": 2.2788370481687965e-06 + }, + { + "step": 609, + "epoch": 3.80625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6921, + "grad_norm": 0.3283686339855194, + "learning_rate": 2.1389693533636455e-06 + }, + { + "step": 610, + "epoch": 3.8125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6989, + "grad_norm": 0.09613630920648575, + "learning_rate": 2.003500187268153e-06 + }, + { + "step": 611, + "epoch": 3.81875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6921, + "grad_norm": 0.21230916678905487, + "learning_rate": 1.8724335797812685e-06 + }, + { + "step": 612, + "epoch": 3.825, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6922, + "grad_norm": 0.17048922181129456, + "learning_rate": 1.7457734298359005e-06 + }, + { + "step": 613, + "epoch": 3.83125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.696, + "grad_norm": 0.08308877795934677, + "learning_rate": 1.6235235052828476e-06 + }, + { + "step": 614, + "epoch": 3.8375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6914, + "grad_norm": 0.2184290736913681, + "learning_rate": 1.505687442778819e-06 + }, + { + "step": 615, + "epoch": 3.84375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6929, + "grad_norm": 0.21098296344280243, + "learning_rate": 1.3922687476781047e-06 + }, + { + "step": 616, + "epoch": 3.85, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6892, + "grad_norm": 0.06611732393503189, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 617, + "epoch": 3.85625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6921, + "grad_norm": 0.18085877597332, + "learning_rate": 1.1786968239705486e-06 + }, + { + "step": 618, + "epoch": 3.8625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.6928, + "grad_norm": 0.0660034567117691, + "learning_rate": 1.0785499486417438e-06 + }, + { + "step": 619, + "epoch": 3.86875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6909, + "grad_norm": 0.0808580294251442, + "learning_rate": 9.82833147083345e-07 + }, + { + "step": 620, + "epoch": 3.875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.691, + "grad_norm": 0.25544846057891846, + "learning_rate": 8.91549266652053e-07 + }, + { + "step": 621, + "epoch": 3.88125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6938, + "grad_norm": 0.3309765160083771, + "learning_rate": 8.04701022835319e-07 + }, + { + "step": 622, + "epoch": 3.8875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6966, + "grad_norm": 0.13793374598026276, + "learning_rate": 7.222909991704773e-07 + }, + { + "step": 623, + "epoch": 3.89375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6905, + "grad_norm": 0.26817551255226135, + "learning_rate": 6.443216471679058e-07 + }, + { + "step": 624, + "epoch": 3.9, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6926, + "grad_norm": 0.1442233771085739, + "learning_rate": 5.707952862381681e-07 + }, + { + "step": 625, + "epoch": 3.90625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.692, + "grad_norm": 0.11467944085597992, + "learning_rate": 5.017141036229522e-07 + }, + { + "step": 626, + "epoch": 3.9125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6931, + "grad_norm": 0.06496375054121017, + "learning_rate": 4.370801543300051e-07 + }, + { + "step": 627, + "epoch": 3.91875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6927, + "grad_norm": 0.13738664984703064, + "learning_rate": 3.768953610720327e-07 + }, + { + "step": 628, + "epoch": 3.925, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6902, + "grad_norm": 0.12869153916835785, + "learning_rate": 3.211615142094781e-07 + }, + { + "step": 629, + "epoch": 3.93125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125259776, + "loss": 0.6921, + "grad_norm": 0.2165045291185379, + "learning_rate": 2.6988027169728145e-07 + }, + { + "step": 630, + "epoch": 3.9375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12525824, + "loss": 0.693, + "grad_norm": 0.2329886555671692, + "learning_rate": 2.2305315903553555e-07 + }, + { + "step": 631, + "epoch": 3.94375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125272064, + "loss": 0.6903, + "grad_norm": 0.18569444119930267, + "learning_rate": 1.8068156922413924e-07 + }, + { + "step": 632, + "epoch": 3.95, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6905, + "grad_norm": 0.11198175698518753, + "learning_rate": 1.4276676272133025e-07 + }, + { + "step": 633, + "epoch": 3.95625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6887, + "grad_norm": 0.07742220908403397, + "learning_rate": 1.0930986740621539e-07 + }, + { + "step": 634, + "epoch": 3.9625, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125262848, + "loss": 0.6931, + "grad_norm": 0.2214115411043167, + "learning_rate": 8.031187854514731e-08 + }, + { + "step": 635, + "epoch": 3.96875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6937, + "grad_norm": 0.07705278694629669, + "learning_rate": 5.577365876224815e-08 + }, + { + "step": 636, + "epoch": 3.975, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125255168, + "loss": 0.6918, + "grad_norm": 0.08737623691558838, + "learning_rate": 3.5695938013630134e-08 + }, + { + "step": 637, + "epoch": 3.98125, + "cpu_mem": 3.337986048, + "gpu_mem": 1.12526592, + "loss": 0.6938, + "grad_norm": 0.38655605912208557, + "learning_rate": 2.007931356572956e-08 + }, + { + "step": 638, + "epoch": 3.9875, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6923, + "grad_norm": 0.2094232439994812, + "learning_rate": 8.924249977537712e-09 + }, + { + "step": 639, + "epoch": 3.99375, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125264384, + "loss": 0.6907, + "grad_norm": 0.21501874923706055, + "learning_rate": 2.2310790867619e-09 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "loss": 0.6907, + "grad_norm": 0.17324714362621307, + "learning_rate": 0.0 + }, + { + "step": 640, + "epoch": 4.0, + "cpu_mem": 3.337986048, + "gpu_mem": 1.125261312, + "train_runtime": 1452.0387, + "train_samples_per_second": 28.192, + "train_steps_per_second": 0.441, + "total_flos": 1.4646189048397824e+16, + "train_loss": 0.7283771393820644 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/adapter_config.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b45b0ec05dc0b2725649e69090539b84b3e445ea --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "enabled_mlp": true, + "enabled_qkv": [ + "q", + "k", + "v" + ], + "fan_in_fan_out": false, + "inference_mode": false, + "layers_pattern": null, + "layers_to_transform": null, + "mixture": false, + "modules_to_preserve_errors": null, + "modules_to_quantize": null, + "modules_to_save": null, + "onnx_export": false, + "optimization_level": 3, + "orthogonal_init": false, + "peft_type": "MARS", + "quant_n_bits": 8, + "r": 32, + "revision": null, + "seed": 42, + "shared_r": 32, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": null, + "use_bnb": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/eval_results.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a20e3f2e9777385a79fb25c4ca798f9c5d214ab7 --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.807645259938838 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/training_configuration.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..a845e62fc32f164da1d92fa178a7afe634d0c034 --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "mars", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 21018624 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-mars-boolq-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-boolq-r32-a2", + "seed": 42, + "timestamp": "2025-09-02T11:29:53.479776" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/training_logs.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..4e558b4855168bb52e17fbed563ffa3904bc1bac --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r32-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 3.067879424, + "gpu_mem": 1.65287424, + "loss": 8.8586, + "grad_norm": 68.87384796142578, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 3.070631936, + "gpu_mem": 1.82116096, + "loss": 8.9138, + "grad_norm": 70.49295043945312, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 3.071418368, + "gpu_mem": 1.821079552, + "loss": 8.1818, + "grad_norm": 69.55998992919922, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 3.072008192, + "gpu_mem": 1.821079552, + "loss": 6.6488, + "grad_norm": 69.75458526611328, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 3.072598016, + "gpu_mem": 1.82101504, + "loss": 4.6674, + "grad_norm": 61.14771270751953, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 3.072991232, + "gpu_mem": 1.821035008, + "loss": 2.7781, + "grad_norm": 43.38957977294922, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 3.073581056, + "gpu_mem": 1.821087232, + "loss": 1.4404, + "grad_norm": 18.615880966186523, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 3.07417088, + "gpu_mem": 1.821173248, + "loss": 0.8792, + "grad_norm": 10.323603630065918, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 3.074564096, + "gpu_mem": 1.821081088, + "loss": 1.0412, + "grad_norm": 20.1569881439209, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 3.074957312, + "gpu_mem": 1.820981248, + "loss": 0.734, + "grad_norm": 8.159199714660645, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 3.075350528, + "gpu_mem": 1.821085696, + "loss": 1.1999, + "grad_norm": 31.27134895324707, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 3.075743744, + "gpu_mem": 1.821457408, + "loss": 1.1859, + "grad_norm": 27.713180541992188, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 3.076333568, + "gpu_mem": 1.82106112, + "loss": 0.6966, + "grad_norm": 5.669466495513916, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 3.076726784, + "gpu_mem": 1.82103808, + "loss": 0.922, + "grad_norm": 14.796576499938965, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 3.076923392, + "gpu_mem": 1.82097664, + "loss": 1.351, + "grad_norm": 21.428823471069336, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 3.077316608, + "gpu_mem": 1.82106112, + "loss": 0.9097, + "grad_norm": 13.006420135498047, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 3.077709824, + "gpu_mem": 1.821101056, + "loss": 0.6797, + "grad_norm": 0.9331075549125671, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 3.07810304, + "gpu_mem": 1.821164032, + "loss": 0.9202, + "grad_norm": 13.914567947387695, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 3.078496256, + "gpu_mem": 1.821001216, + "loss": 0.7902, + "grad_norm": 10.132259368896484, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 3.078692864, + "gpu_mem": 1.821113344, + "loss": 0.6598, + "grad_norm": 5.340718746185303, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 3.07908608, + "gpu_mem": 1.821271552, + "loss": 0.7344, + "grad_norm": 6.192241668701172, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 3.079479296, + "gpu_mem": 1.821164032, + "loss": 0.9945, + "grad_norm": 10.157393455505371, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 3.079675904, + "gpu_mem": 1.821136384, + "loss": 0.732, + "grad_norm": 4.2285308837890625, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 3.08006912, + "gpu_mem": 1.821193216, + "loss": 0.6698, + "grad_norm": 5.390683174133301, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 3.080462336, + "gpu_mem": 1.820978176, + "loss": 0.6955, + "grad_norm": 3.9773783683776855, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 3.080658944, + "gpu_mem": 1.821033472, + "loss": 0.6924, + "grad_norm": 1.9215008020401, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 3.080855552, + "gpu_mem": 1.821325312, + "loss": 0.6542, + "grad_norm": 3.314211368560791, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 3.08105216, + "gpu_mem": 1.821004288, + "loss": 0.7542, + "grad_norm": 5.283987045288086, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 3.081248768, + "gpu_mem": 1.8210688, + "loss": 0.6318, + "grad_norm": 0.7347916960716248, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 3.081445376, + "gpu_mem": 1.821147136, + "loss": 0.6533, + "grad_norm": 0.9134010672569275, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 3.081641984, + "gpu_mem": 1.820950528, + "loss": 0.6258, + "grad_norm": 3.723344087600708, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 3.0820352, + "gpu_mem": 1.821064192, + "loss": 0.704, + "grad_norm": 3.575549364089966, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 3.082231808, + "gpu_mem": 1.821302272, + "loss": 0.7105, + "grad_norm": 1.3929890394210815, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 3.082625024, + "gpu_mem": 1.821004288, + "loss": 0.709, + "grad_norm": 6.829777717590332, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 3.082821632, + "gpu_mem": 1.82121472, + "loss": 0.6779, + "grad_norm": 0.9693426489830017, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 3.08301824, + "gpu_mem": 1.821165568, + "loss": 0.6436, + "grad_norm": 1.6492255926132202, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 3.083214848, + "gpu_mem": 1.82097664, + "loss": 0.6186, + "grad_norm": 1.9952858686447144, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 3.083411456, + "gpu_mem": 1.821223936, + "loss": 0.7312, + "grad_norm": 4.579022407531738, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 3.083608064, + "gpu_mem": 1.821603328, + "loss": 0.7613, + "grad_norm": 6.799362659454346, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 3.083804672, + "gpu_mem": 1.821173248, + "loss": 0.719, + "grad_norm": 7.090038299560547, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 3.08400128, + "gpu_mem": 1.821400576, + "loss": 0.6777, + "grad_norm": 0.9114485383033752, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 3.08400128, + "gpu_mem": 1.821297664, + "loss": 0.6254, + "grad_norm": 2.1200194358825684, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 3.08400128, + "gpu_mem": 1.821119488, + "loss": 0.6097, + "grad_norm": 4.063666820526123, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 3.084197888, + "gpu_mem": 1.821262336, + "loss": 0.6051, + "grad_norm": 2.041332483291626, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 3.084394496, + "gpu_mem": 1.821042688, + "loss": 1.1121, + "grad_norm": 10.205484390258789, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 3.084591104, + "gpu_mem": 1.821285376, + "loss": 0.8166, + "grad_norm": 6.637809753417969, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 3.084787712, + "gpu_mem": 1.821008896, + "loss": 0.5818, + "grad_norm": 2.5051164627075195, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 3.08498432, + "gpu_mem": 1.821085696, + "loss": 0.7606, + "grad_norm": 7.728519916534424, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 3.08498432, + "gpu_mem": 1.821102592, + "loss": 0.7961, + "grad_norm": 5.756411552429199, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 3.085180928, + "gpu_mem": 1.821041152, + "loss": 0.6894, + "grad_norm": 5.5516767501831055, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 3.085180928, + "gpu_mem": 1.82104576, + "loss": 0.5769, + "grad_norm": 1.282005786895752, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 3.085377536, + "gpu_mem": 1.821125632, + "loss": 1.0026, + "grad_norm": 6.9171857833862305, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 3.085377536, + "gpu_mem": 1.821148672, + "loss": 0.6944, + "grad_norm": 4.297009468078613, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 3.085574144, + "gpu_mem": 1.82107648, + "loss": 1.0375, + "grad_norm": 6.81887674331665, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 3.085770752, + "gpu_mem": 1.821346816, + "loss": 0.5916, + "grad_norm": 1.279556393623352, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 3.085770752, + "gpu_mem": 1.821133312, + "loss": 0.7076, + "grad_norm": 2.5867040157318115, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 3.085770752, + "gpu_mem": 1.821127168, + "loss": 0.8227, + "grad_norm": 6.0817084312438965, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 3.085770752, + "gpu_mem": 1.82102272, + "loss": 0.6387, + "grad_norm": 3.0675289630889893, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 3.08596736, + "gpu_mem": 1.821039616, + "loss": 0.6917, + "grad_norm": 3.3201441764831543, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 3.08596736, + "gpu_mem": 1.821133312, + "loss": 0.6182, + "grad_norm": 2.879568338394165, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 3.086163968, + "gpu_mem": 1.821144064, + "loss": 0.5991, + "grad_norm": 1.3914790153503418, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 3.086360576, + "gpu_mem": 1.821131776, + "loss": 0.7127, + "grad_norm": 1.0733517408370972, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 3.086360576, + "gpu_mem": 1.821124096, + "loss": 0.7192, + "grad_norm": 5.623044013977051, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 3.086557184, + "gpu_mem": 1.82105344, + "loss": 0.7199, + "grad_norm": 4.617927074432373, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 3.086557184, + "gpu_mem": 1.821097984, + "loss": 0.6343, + "grad_norm": 1.7221837043762207, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 3.086557184, + "gpu_mem": 1.82129152, + "loss": 0.6031, + "grad_norm": 1.726184606552124, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 3.086557184, + "gpu_mem": 1.821001216, + "loss": 0.6923, + "grad_norm": 3.2478525638580322, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 3.086753792, + "gpu_mem": 1.82096896, + "loss": 0.8044, + "grad_norm": 4.173428535461426, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 3.0869504, + "gpu_mem": 1.821035008, + "loss": 0.5631, + "grad_norm": 0.7027332782745361, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 3.0869504, + "gpu_mem": 1.821028864, + "loss": 0.5858, + "grad_norm": 2.211245536804199, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 3.0869504, + "gpu_mem": 1.821257728, + "loss": 0.6429, + "grad_norm": 4.235151290893555, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 3.0869504, + "gpu_mem": 1.821250048, + "loss": 0.5417, + "grad_norm": 2.080139398574829, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 3.0869504, + "gpu_mem": 1.821216256, + "loss": 0.7728, + "grad_norm": 4.390351295471191, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 3.0869504, + "gpu_mem": 1.82107648, + "loss": 0.5373, + "grad_norm": 2.799133062362671, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 3.0869504, + "gpu_mem": 1.821001216, + "loss": 0.4932, + "grad_norm": 1.385074496269226, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 3.087147008, + "gpu_mem": 1.820941312, + "loss": 0.5045, + "grad_norm": 1.8684202432632446, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 3.087343616, + "gpu_mem": 1.82101504, + "loss": 0.5677, + "grad_norm": 3.309020519256592, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 3.087343616, + "gpu_mem": 1.821067264, + "loss": 0.5923, + "grad_norm": 3.132214307785034, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 3.087343616, + "gpu_mem": 1.82119936, + "loss": 0.5859, + "grad_norm": 1.3290568590164185, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 3.087343616, + "gpu_mem": 1.821090304, + "loss": 0.5663, + "grad_norm": 1.8612052202224731, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 3.087343616, + "gpu_mem": 1.820970496, + "loss": 0.5933, + "grad_norm": 1.262265920639038, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 3.087343616, + "gpu_mem": 1.821039616, + "loss": 0.5743, + "grad_norm": 1.8388952016830444, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821139456, + "loss": 0.6411, + "grad_norm": 1.4783612489700317, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821102592, + "loss": 0.6417, + "grad_norm": 4.5199198722839355, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821134848, + "loss": 0.6002, + "grad_norm": 2.9216723442077637, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821085696, + "loss": 0.627, + "grad_norm": 2.0996506214141846, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821093376, + "loss": 0.5831, + "grad_norm": 2.96175217628479, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 3.087540224, + "gpu_mem": 1.82123776, + "loss": 0.5318, + "grad_norm": 1.351048231124878, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821019648, + "loss": 0.6521, + "grad_norm": 1.614142656326294, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821073408, + "loss": 0.5216, + "grad_norm": 2.7360522747039795, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821041152, + "loss": 0.4923, + "grad_norm": 2.242475748062134, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 3.087540224, + "gpu_mem": 1.82112256, + "loss": 0.8061, + "grad_norm": 6.171937465667725, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 3.087540224, + "gpu_mem": 1.820925952, + "loss": 0.7999, + "grad_norm": 5.889636516571045, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 3.087540224, + "gpu_mem": 1.821039616, + "loss": 0.5113, + "grad_norm": 2.6934938430786133, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 3.087736832, + "gpu_mem": 1.821059584, + "loss": 0.6275, + "grad_norm": 2.2074472904205322, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 3.087736832, + "gpu_mem": 1.821097984, + "loss": 0.5287, + "grad_norm": 2.2889792919158936, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 3.087736832, + "gpu_mem": 1.821082624, + "loss": 0.5444, + "grad_norm": 1.904442548751831, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 3.087736832, + "gpu_mem": 1.820995072, + "loss": 0.584, + "grad_norm": 1.780060052871704, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 3.087736832, + "gpu_mem": 1.820944384, + "loss": 0.5249, + "grad_norm": 2.3458874225616455, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 3.087736832, + "gpu_mem": 1.82106112, + "loss": 0.5166, + "grad_norm": 2.62747859954834, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821093376, + "loss": 0.5458, + "grad_norm": 3.075012683868408, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821127168, + "loss": 0.5595, + "grad_norm": 2.9845681190490723, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821177856, + "loss": 0.6022, + "grad_norm": 2.430687427520752, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821082624, + "loss": 0.5753, + "grad_norm": 3.7953081130981445, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821184, + "loss": 0.5786, + "grad_norm": 2.2572851181030273, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821134848, + "loss": 0.4867, + "grad_norm": 2.731645345687866, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 3.08793344, + "gpu_mem": 1.82102272, + "loss": 0.4715, + "grad_norm": 2.1887848377227783, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 3.08793344, + "gpu_mem": 1.82120704, + "loss": 0.538, + "grad_norm": 2.018214702606201, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 3.08793344, + "gpu_mem": 1.82106112, + "loss": 0.6104, + "grad_norm": 2.1448326110839844, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821064192, + "loss": 0.5229, + "grad_norm": 2.063460111618042, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821033472, + "loss": 0.4985, + "grad_norm": 2.1617538928985596, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821079552, + "loss": 0.5175, + "grad_norm": 2.1552674770355225, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821070336, + "loss": 0.5277, + "grad_norm": 3.787642240524292, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821051904, + "loss": 0.4492, + "grad_norm": 2.0796303749084473, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821127168, + "loss": 0.4392, + "grad_norm": 1.841799259185791, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821047296, + "loss": 0.6744, + "grad_norm": 4.277587890625, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 3.08793344, + "gpu_mem": 1.82093824, + "loss": 0.5132, + "grad_norm": 2.3245608806610107, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 3.08793344, + "gpu_mem": 1.82117632, + "loss": 0.582, + "grad_norm": 3.7581348419189453, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821346816, + "loss": 0.4666, + "grad_norm": 2.6801259517669678, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821079552, + "loss": 0.4339, + "grad_norm": 2.298764228820801, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 3.08793344, + "gpu_mem": 1.8211072, + "loss": 0.5424, + "grad_norm": 2.430497407913208, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821157888, + "loss": 0.3895, + "grad_norm": 2.3427796363830566, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 3.08793344, + "gpu_mem": 1.820967424, + "loss": 0.5118, + "grad_norm": 3.429617404937744, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821409792, + "loss": 0.4851, + "grad_norm": 1.9890496730804443, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821136384, + "loss": 0.5656, + "grad_norm": 3.4110472202301025, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821019648, + "loss": 0.5503, + "grad_norm": 3.3352224826812744, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821458944, + "loss": 0.5308, + "grad_norm": 2.0365774631500244, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821234688, + "loss": 0.4275, + "grad_norm": 2.6051623821258545, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821274624, + "loss": 0.6324, + "grad_norm": 3.0258097648620605, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 3.08793344, + "gpu_mem": 1.821056512, + "loss": 0.593, + "grad_norm": 2.115528106689453, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821185536, + "loss": 0.4707, + "grad_norm": 1.3393425941467285, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821266944, + "loss": 0.5119, + "grad_norm": 3.0318400859832764, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821050368, + "loss": 0.4862, + "grad_norm": 3.282351016998291, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821184, + "loss": 0.4504, + "grad_norm": 2.880984306335449, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 3.088130048, + "gpu_mem": 1.82120704, + "loss": 0.5226, + "grad_norm": 2.555544853210449, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821044224, + "loss": 0.5268, + "grad_norm": 3.1878459453582764, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 3.088130048, + "gpu_mem": 1.820924416, + "loss": 0.6217, + "grad_norm": 4.039100646972656, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821105664, + "loss": 0.4681, + "grad_norm": 2.8015811443328857, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821004288, + "loss": 0.3943, + "grad_norm": 1.8389748334884644, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821056512, + "loss": 0.5247, + "grad_norm": 3.605134963989258, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821088768, + "loss": 0.778, + "grad_norm": 5.948938369750977, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821228544, + "loss": 0.5586, + "grad_norm": 3.9241139888763428, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821211648, + "loss": 0.5738, + "grad_norm": 2.483680009841919, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821403648, + "loss": 0.5539, + "grad_norm": 2.19439697265625, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 3.088130048, + "gpu_mem": 1.82111488, + "loss": 0.4885, + "grad_norm": 1.787278413772583, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821150208, + "loss": 0.6056, + "grad_norm": 2.780120611190796, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 3.088130048, + "gpu_mem": 1.821048832, + "loss": 0.5866, + "grad_norm": 2.0629403591156006, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90528768, + "loss": 0.5888, + "grad_norm": 2.759999990463257, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905223168, + "loss": 0.4555, + "grad_norm": 1.639000415802002, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905060352, + "loss": 0.4355, + "grad_norm": 1.898119568824768, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905132544, + "loss": 0.4667, + "grad_norm": 1.5665459632873535, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905167872, + "loss": 0.3741, + "grad_norm": 1.4567912817001343, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905192448, + "loss": 0.4774, + "grad_norm": 1.4101380109786987, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905154048, + "loss": 0.3512, + "grad_norm": 1.2983170747756958, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905375232, + "loss": 0.4622, + "grad_norm": 2.1358556747436523, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905283072, + "loss": 0.3399, + "grad_norm": 2.1808676719665527, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905189376, + "loss": 0.4186, + "grad_norm": 2.2214033603668213, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905115648, + "loss": 0.3355, + "grad_norm": 2.6199612617492676, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90546432, + "loss": 0.4038, + "grad_norm": 2.1600446701049805, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905058816, + "loss": 0.3416, + "grad_norm": 2.210366725921631, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905005056, + "loss": 0.3772, + "grad_norm": 3.0205626487731934, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905780736, + "loss": 0.2804, + "grad_norm": 1.8242988586425781, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90525696, + "loss": 0.3589, + "grad_norm": 2.2699551582336426, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905169408, + "loss": 0.4525, + "grad_norm": 2.6243233680725098, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90511872, + "loss": 0.2822, + "grad_norm": 1.9388184547424316, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905213952, + "loss": 0.3213, + "grad_norm": 2.853483200073242, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905131008, + "loss": 0.3941, + "grad_norm": 2.3319296836853027, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90514944, + "loss": 0.428, + "grad_norm": 3.1277780532836914, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905236992, + "loss": 0.5057, + "grad_norm": 4.2538251876831055, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905121792, + "loss": 0.3875, + "grad_norm": 2.391735792160034, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905184768, + "loss": 0.5268, + "grad_norm": 3.7759625911712646, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905092608, + "loss": 0.3404, + "grad_norm": 3.112100601196289, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905392128, + "loss": 0.4771, + "grad_norm": 3.123610734939575, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905115648, + "loss": 0.4069, + "grad_norm": 2.8058180809020996, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905081856, + "loss": 0.3842, + "grad_norm": 2.293651819229126, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905220096, + "loss": 0.3947, + "grad_norm": 3.036417007446289, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9053184, + "loss": 0.4337, + "grad_norm": 2.6453988552093506, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90506496, + "loss": 0.4354, + "grad_norm": 2.252537965774536, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9051648, + "loss": 0.6042, + "grad_norm": 3.986675262451172, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905137152, + "loss": 0.3888, + "grad_norm": 2.408649444580078, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905074176, + "loss": 0.2908, + "grad_norm": 1.7434226274490356, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905292288, + "loss": 0.4571, + "grad_norm": 3.20436692237854, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905189376, + "loss": 0.369, + "grad_norm": 2.2354419231414795, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905137152, + "loss": 0.2711, + "grad_norm": 1.473290205001831, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905115648, + "loss": 0.3971, + "grad_norm": 1.987845778465271, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905124864, + "loss": 0.3837, + "grad_norm": 2.081988573074341, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90505728, + "loss": 0.4075, + "grad_norm": 3.0159099102020264, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905220096, + "loss": 0.2327, + "grad_norm": 1.6922200918197632, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905089536, + "loss": 0.3871, + "grad_norm": 2.6570751667022705, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905209344, + "loss": 0.4401, + "grad_norm": 3.228680372238159, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905028096, + "loss": 0.2657, + "grad_norm": 2.0415596961975098, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905160192, + "loss": 0.2755, + "grad_norm": 2.5326294898986816, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90513408, + "loss": 0.3214, + "grad_norm": 3.19537615776062, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905100288, + "loss": 0.3248, + "grad_norm": 2.7023966312408447, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905204736, + "loss": 0.2241, + "grad_norm": 2.757185220718384, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905200128, + "loss": 0.3704, + "grad_norm": 2.4209144115448, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905058816, + "loss": 0.3498, + "grad_norm": 4.108653545379639, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905250816, + "loss": 0.2934, + "grad_norm": 3.206144094467163, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905101824, + "loss": 0.3563, + "grad_norm": 3.5283663272857666, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905204736, + "loss": 0.2369, + "grad_norm": 2.480332136154175, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905407488, + "loss": 0.3149, + "grad_norm": 3.096338987350464, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905209344, + "loss": 0.3531, + "grad_norm": 3.719872236251831, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90509568, + "loss": 0.2387, + "grad_norm": 2.4631316661834717, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905109504, + "loss": 0.3279, + "grad_norm": 2.2474913597106934, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905154048, + "loss": 0.3099, + "grad_norm": 2.455280065536499, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90509568, + "loss": 0.3017, + "grad_norm": 2.1304771900177, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905329152, + "loss": 0.3679, + "grad_norm": 2.926450252532959, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905359872, + "loss": 0.2293, + "grad_norm": 3.0154922008514404, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905286144, + "loss": 0.3435, + "grad_norm": 3.2104692459106445, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905174016, + "loss": 0.361, + "grad_norm": 2.571568250656128, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905123328, + "loss": 0.3191, + "grad_norm": 2.800898551940918, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905092608, + "loss": 0.2131, + "grad_norm": 1.7589691877365112, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905115648, + "loss": 0.3144, + "grad_norm": 3.1759116649627686, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905198592, + "loss": 0.3252, + "grad_norm": 2.7820334434509277, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9051264, + "loss": 0.4845, + "grad_norm": 4.795464992523193, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905292288, + "loss": 0.2801, + "grad_norm": 2.4745736122131348, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90513408, + "loss": 0.4736, + "grad_norm": 4.10355281829834, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90511104, + "loss": 0.3417, + "grad_norm": 2.9731242656707764, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905236992, + "loss": 0.432, + "grad_norm": 3.3400681018829346, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905270784, + "loss": 0.3862, + "grad_norm": 3.405529737472534, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905138688, + "loss": 0.3309, + "grad_norm": 2.390625, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905275392, + "loss": 0.3287, + "grad_norm": 2.8816545009613037, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905189376, + "loss": 0.3474, + "grad_norm": 3.9402477741241455, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905154048, + "loss": 0.2141, + "grad_norm": 2.1563289165496826, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90511872, + "loss": 0.2424, + "grad_norm": 2.032608985900879, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905267712, + "loss": 0.3345, + "grad_norm": 2.1559553146362305, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90515712, + "loss": 0.2854, + "grad_norm": 2.230639934539795, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905101824, + "loss": 0.3652, + "grad_norm": 2.4192652702331543, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905043456, + "loss": 0.2825, + "grad_norm": 1.8954260349273682, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905100288, + "loss": 0.2852, + "grad_norm": 2.1987552642822266, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905378304, + "loss": 0.3409, + "grad_norm": 2.9520516395568848, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905101824, + "loss": 0.3936, + "grad_norm": 2.425276041030884, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905415168, + "loss": 0.2775, + "grad_norm": 2.4323878288269043, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905290752, + "loss": 0.2819, + "grad_norm": 2.0637269020080566, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905046528, + "loss": 0.1979, + "grad_norm": 1.6604588031768799, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905106432, + "loss": 0.3534, + "grad_norm": 2.7899138927459717, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905167872, + "loss": 0.3442, + "grad_norm": 2.851421594619751, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905169408, + "loss": 0.2356, + "grad_norm": 2.115088701248169, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905422848, + "loss": 0.3174, + "grad_norm": 2.6605639457702637, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90507264, + "loss": 0.5938, + "grad_norm": 5.371547222137451, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905369088, + "loss": 0.3722, + "grad_norm": 3.1654694080352783, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905230848, + "loss": 0.3948, + "grad_norm": 2.478217124938965, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905083392, + "loss": 0.3575, + "grad_norm": 2.542072057723999, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905223168, + "loss": 0.2856, + "grad_norm": 3.4679834842681885, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905101824, + "loss": 0.34, + "grad_norm": 2.368711471557617, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90519552, + "loss": 0.3686, + "grad_norm": 2.4036035537719727, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905213952, + "loss": 0.235, + "grad_norm": 2.453899383544922, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905166336, + "loss": 0.3151, + "grad_norm": 2.232832431793213, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90507264, + "loss": 0.307, + "grad_norm": 2.11991286277771, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9051648, + "loss": 0.3301, + "grad_norm": 2.8754398822784424, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905077248, + "loss": 0.2503, + "grad_norm": 3.0982937812805176, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905120256, + "loss": 0.3421, + "grad_norm": 2.0872092247009277, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905313792, + "loss": 0.3734, + "grad_norm": 3.675940752029419, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905109504, + "loss": 0.3215, + "grad_norm": 2.3378677368164062, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905266176, + "loss": 0.2911, + "grad_norm": 2.9144768714904785, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905092608, + "loss": 0.2758, + "grad_norm": 2.1164915561676025, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905505792, + "loss": 0.3197, + "grad_norm": 2.065190315246582, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9051648, + "loss": 0.3354, + "grad_norm": 3.146599531173706, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905081856, + "loss": 0.3881, + "grad_norm": 3.2915120124816895, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905198592, + "loss": 0.2118, + "grad_norm": 1.4625931978225708, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905154048, + "loss": 0.1809, + "grad_norm": 1.6819443702697754, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90511104, + "loss": 0.2982, + "grad_norm": 2.6149179935455322, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905147904, + "loss": 0.2749, + "grad_norm": 2.261990547180176, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905235456, + "loss": 0.2551, + "grad_norm": 2.130862236022949, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905154048, + "loss": 0.4172, + "grad_norm": 3.065723419189453, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905369088, + "loss": 0.5303, + "grad_norm": 2.521975517272949, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905161728, + "loss": 0.3578, + "grad_norm": 3.311399459838867, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905166336, + "loss": 0.2372, + "grad_norm": 2.3744606971740723, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905177088, + "loss": 0.3684, + "grad_norm": 3.0438168048858643, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905215488, + "loss": 0.2554, + "grad_norm": 2.380549907684326, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905267712, + "loss": 0.3092, + "grad_norm": 2.471889019012451, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9051264, + "loss": 0.3333, + "grad_norm": 2.126983404159546, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905006592, + "loss": 0.3154, + "grad_norm": 2.670673370361328, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90523392, + "loss": 0.2453, + "grad_norm": 1.9982452392578125, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905478144, + "loss": 0.3091, + "grad_norm": 2.6423826217651367, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905138688, + "loss": 0.299, + "grad_norm": 2.723599672317505, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905084928, + "loss": 0.3956, + "grad_norm": 3.303978204727173, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905247744, + "loss": 0.3713, + "grad_norm": 2.627657890319824, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90518784, + "loss": 0.3013, + "grad_norm": 2.154676914215088, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905167872, + "loss": 0.3185, + "grad_norm": 2.9566659927368164, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 3.088130048, + "gpu_mem": 1.90510336, + "loss": 0.3667, + "grad_norm": 2.5651063919067383, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905531904, + "loss": 0.2643, + "grad_norm": 2.6878583431243896, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905238528, + "loss": 0.3464, + "grad_norm": 2.3561227321624756, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905094144, + "loss": 0.3194, + "grad_norm": 2.2974135875701904, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905147904, + "loss": 0.3036, + "grad_norm": 3.040360689163208, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905565696, + "loss": 0.2789, + "grad_norm": 2.4805023670196533, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905335296, + "loss": 0.2411, + "grad_norm": 2.1130123138427734, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905120256, + "loss": 0.4391, + "grad_norm": 2.3872954845428467, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905204736, + "loss": 0.1848, + "grad_norm": 1.850911021232605, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905129472, + "loss": 0.407, + "grad_norm": 4.644960403442383, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9051648, + "loss": 0.2849, + "grad_norm": 2.7832741737365723, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905247744, + "loss": 0.3232, + "grad_norm": 2.5905661582946777, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 3.088130048, + "gpu_mem": 1.9051648, + "loss": 0.3863, + "grad_norm": 2.8176510334014893, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905190912, + "loss": 0.3619, + "grad_norm": 2.6373684406280518, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 3.088130048, + "gpu_mem": 1.905190912, + "train_runtime": 4584.1691, + "train_samples_per_second": 4.113, + "train_steps_per_second": 0.064, + "total_flos": 0.0, + "train_loss": 0.6196057465003462 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/adapter_config.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f92b7dff40e15f34db28e26d606dc21c058568aa --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha": 16, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "enabled_mlp": true, + "enabled_qkv": [ + "q", + "k", + "v" + ], + "fan_in_fan_out": false, + "inference_mode": false, + "layers_pattern": null, + "layers_to_transform": null, + "mixture": false, + "modules_to_preserve_errors": null, + "modules_to_quantize": null, + "modules_to_save": null, + "onnx_export": false, + "optimization_level": 3, + "orthogonal_init": false, + "peft_type": "MARS", + "quant_n_bits": 8, + "r": 8, + "revision": null, + "seed": 42, + "shared_r": 8, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": null, + "use_bnb": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/eval_results.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..809e8bad459bea8306771cc61ea38e11f9efccb4 --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "boolq", + "results": 0.7984709480122324 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/training_configuration.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..b1022d01ecfac5d8c57f146ae55843264ed3f675 --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "BOOLQ", + "dataset_id": "google/boolq", + "preprocess_id": "boolq_train_deepeval" + }, + "peft_config": { + "method": "mars", + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 5233536 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 2, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-mars-boolq-r8-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-boolq-r8-a2", + "seed": 42, + "timestamp": "2025-09-02T04:10:03.769222" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/training_logs.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..40aa478d24a41286a3c949350c008c7b5d93c423 --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r8-a2/training_logs.json @@ -0,0 +1,2659 @@ +[ + { + "step": 1, + "epoch": 0.006779661016949152, + "cpu_mem": 2.267635712, + "gpu_mem": 1.587642368, + "loss": 8.8586, + "grad_norm": 144.2429656982422, + "learning_rate": 9.999999999999999e-06 + }, + { + "step": 2, + "epoch": 0.013559322033898305, + "cpu_mem": 2.270584832, + "gpu_mem": 1.629659648, + "loss": 8.9138, + "grad_norm": 147.96966552734375, + "learning_rate": 1.9999999999999998e-05 + }, + { + "step": 3, + "epoch": 0.020338983050847456, + "cpu_mem": 2.271174656, + "gpu_mem": 1.62957824, + "loss": 8.0696, + "grad_norm": 146.60386657714844, + "learning_rate": 2.9999999999999997e-05 + }, + { + "step": 4, + "epoch": 0.02711864406779661, + "cpu_mem": 2.271961088, + "gpu_mem": 1.62957824, + "loss": 6.3472, + "grad_norm": 145.776611328125, + "learning_rate": 3.9999999999999996e-05 + }, + { + "step": 5, + "epoch": 0.03389830508474576, + "cpu_mem": 2.272354304, + "gpu_mem": 1.629513728, + "loss": 4.218, + "grad_norm": 125.45213317871094, + "learning_rate": 4.9999999999999996e-05 + }, + { + "step": 6, + "epoch": 0.04067796610169491, + "cpu_mem": 2.272944128, + "gpu_mem": 1.629533696, + "loss": 2.3874, + "grad_norm": 73.55642700195312, + "learning_rate": 5.9999999999999995e-05 + }, + { + "step": 7, + "epoch": 0.04745762711864407, + "cpu_mem": 2.273533952, + "gpu_mem": 1.62958592, + "loss": 1.3013, + "grad_norm": 33.80267333984375, + "learning_rate": 7e-05 + }, + { + "step": 8, + "epoch": 0.05423728813559322, + "cpu_mem": 2.274123776, + "gpu_mem": 1.629671936, + "loss": 0.7852, + "grad_norm": 16.36908721923828, + "learning_rate": 7.999999999999999e-05 + }, + { + "step": 9, + "epoch": 0.061016949152542375, + "cpu_mem": 2.274516992, + "gpu_mem": 1.629579776, + "loss": 1.1211, + "grad_norm": 53.422481536865234, + "learning_rate": 8.999999999999999e-05 + }, + { + "step": 10, + "epoch": 0.06779661016949153, + "cpu_mem": 2.275106816, + "gpu_mem": 1.629479936, + "loss": 0.72, + "grad_norm": 13.193144798278809, + "learning_rate": 9.999999999999999e-05 + }, + { + "step": 11, + "epoch": 0.07457627118644068, + "cpu_mem": 2.275500032, + "gpu_mem": 1.629584384, + "loss": 1.5928, + "grad_norm": 83.73548889160156, + "learning_rate": 0.00010999999999999998 + }, + { + "step": 12, + "epoch": 0.08135593220338982, + "cpu_mem": 2.275893248, + "gpu_mem": 1.629956096, + "loss": 1.4642, + "grad_norm": 66.13136291503906, + "learning_rate": 0.00011999999999999999 + }, + { + "step": 13, + "epoch": 0.08813559322033898, + "cpu_mem": 2.276286464, + "gpu_mem": 1.629559808, + "loss": 0.7507, + "grad_norm": 19.544227600097656, + "learning_rate": 0.00013 + }, + { + "step": 14, + "epoch": 0.09491525423728814, + "cpu_mem": 2.27667968, + "gpu_mem": 1.629536768, + "loss": 0.8223, + "grad_norm": 22.892868041992188, + "learning_rate": 0.00014 + }, + { + "step": 15, + "epoch": 0.1016949152542373, + "cpu_mem": 2.277072896, + "gpu_mem": 1.629475328, + "loss": 1.1516, + "grad_norm": 34.80082321166992, + "learning_rate": 0.00015 + }, + { + "step": 16, + "epoch": 0.10847457627118644, + "cpu_mem": 2.277466112, + "gpu_mem": 1.629559808, + "loss": 0.787, + "grad_norm": 15.969379425048828, + "learning_rate": 0.00015999999999999999 + }, + { + "step": 17, + "epoch": 0.1152542372881356, + "cpu_mem": 2.27766272, + "gpu_mem": 1.629599744, + "loss": 0.7201, + "grad_norm": 8.328862190246582, + "learning_rate": 0.00016999999999999999 + }, + { + "step": 18, + "epoch": 0.12203389830508475, + "cpu_mem": 2.278055936, + "gpu_mem": 1.62966272, + "loss": 0.802, + "grad_norm": 16.595346450805664, + "learning_rate": 0.00017999999999999998 + }, + { + "step": 19, + "epoch": 0.1288135593220339, + "cpu_mem": 2.278449152, + "gpu_mem": 1.629499904, + "loss": 0.6962, + "grad_norm": 2.630282163619995, + "learning_rate": 0.00018999999999999998 + }, + { + "step": 20, + "epoch": 0.13559322033898305, + "cpu_mem": 2.278842368, + "gpu_mem": 1.629612032, + "loss": 0.6674, + "grad_norm": 10.08876895904541, + "learning_rate": 0.00019999999999999998 + }, + { + "step": 21, + "epoch": 0.1423728813559322, + "cpu_mem": 2.279038976, + "gpu_mem": 1.62977024, + "loss": 0.6576, + "grad_norm": 4.867812156677246, + "learning_rate": 0.00020999999999999998 + }, + { + "step": 22, + "epoch": 0.14915254237288136, + "cpu_mem": 2.279432192, + "gpu_mem": 1.62966272, + "loss": 0.7288, + "grad_norm": 10.249846458435059, + "learning_rate": 0.00021999999999999995 + }, + { + "step": 23, + "epoch": 0.15593220338983052, + "cpu_mem": 2.2796288, + "gpu_mem": 1.629635072, + "loss": 0.7356, + "grad_norm": 15.4491605758667, + "learning_rate": 0.00023 + }, + { + "step": 24, + "epoch": 0.16271186440677965, + "cpu_mem": 2.280022016, + "gpu_mem": 1.629691904, + "loss": 0.6953, + "grad_norm": 19.0009708404541, + "learning_rate": 0.00023999999999999998 + }, + { + "step": 25, + "epoch": 0.1694915254237288, + "cpu_mem": 2.280415232, + "gpu_mem": 1.629476864, + "loss": 0.6648, + "grad_norm": 6.04645299911499, + "learning_rate": 0.00025 + }, + { + "step": 26, + "epoch": 0.17627118644067796, + "cpu_mem": 2.28061184, + "gpu_mem": 1.62953216, + "loss": 0.7047, + "grad_norm": 8.8228120803833, + "learning_rate": 0.00026 + }, + { + "step": 27, + "epoch": 0.18305084745762712, + "cpu_mem": 2.280808448, + "gpu_mem": 1.629824, + "loss": 0.7486, + "grad_norm": 20.652299880981445, + "learning_rate": 0.00027 + }, + { + "step": 28, + "epoch": 0.18983050847457628, + "cpu_mem": 2.281005056, + "gpu_mem": 1.629502976, + "loss": 0.6838, + "grad_norm": 6.600456237792969, + "learning_rate": 0.00028 + }, + { + "step": 29, + "epoch": 0.19661016949152543, + "cpu_mem": 2.281398272, + "gpu_mem": 1.629567488, + "loss": 0.6161, + "grad_norm": 4.181739807128906, + "learning_rate": 0.00029 + }, + { + "step": 30, + "epoch": 0.2033898305084746, + "cpu_mem": 2.28159488, + "gpu_mem": 1.629645824, + "loss": 0.673, + "grad_norm": 8.070191383361816, + "learning_rate": 0.0003 + }, + { + "step": 31, + "epoch": 0.21016949152542372, + "cpu_mem": 2.281791488, + "gpu_mem": 1.629449216, + "loss": 0.6888, + "grad_norm": 17.474058151245117, + "learning_rate": 0.0002999893794250036 + }, + { + "step": 32, + "epoch": 0.21694915254237288, + "cpu_mem": 2.281988096, + "gpu_mem": 1.62956288, + "loss": 0.6913, + "grad_norm": 10.40870475769043, + "learning_rate": 0.00029995751920396937 + }, + { + "step": 33, + "epoch": 0.22372881355932203, + "cpu_mem": 2.282184704, + "gpu_mem": 1.62980096, + "loss": 0.7044, + "grad_norm": 7.032500743865967, + "learning_rate": 0.00029990442384854874 + }, + { + "step": 34, + "epoch": 0.2305084745762712, + "cpu_mem": 2.282381312, + "gpu_mem": 1.629502976, + "loss": 0.5805, + "grad_norm": 2.290640115737915, + "learning_rate": 0.0002998301008774512 + }, + { + "step": 35, + "epoch": 0.23728813559322035, + "cpu_mem": 2.28257792, + "gpu_mem": 1.629713408, + "loss": 0.6416, + "grad_norm": 5.307508945465088, + "learning_rate": 0.0002997345608153792 + }, + { + "step": 36, + "epoch": 0.2440677966101695, + "cpu_mem": 2.282774528, + "gpu_mem": 1.629664256, + "loss": 0.9183, + "grad_norm": 25.747779846191406, + "learning_rate": 0.000299617817191538 + }, + { + "step": 37, + "epoch": 0.25084745762711863, + "cpu_mem": 2.282971136, + "gpu_mem": 1.629475328, + "loss": 0.9282, + "grad_norm": 26.683223724365234, + "learning_rate": 0.0002994798865377198 + }, + { + "step": 38, + "epoch": 0.2576271186440678, + "cpu_mem": 2.283167744, + "gpu_mem": 1.629722624, + "loss": 0.6603, + "grad_norm": 2.1753015518188477, + "learning_rate": 0.0002993207883859627 + }, + { + "step": 39, + "epoch": 0.26440677966101694, + "cpu_mem": 2.283364352, + "gpu_mem": 1.630102016, + "loss": 0.6032, + "grad_norm": 2.1343305110931396, + "learning_rate": 0.0002991405452657846 + }, + { + "step": 40, + "epoch": 0.2711864406779661, + "cpu_mem": 2.28356096, + "gpu_mem": 1.629671936, + "loss": 0.6237, + "grad_norm": 3.13690185546875, + "learning_rate": 0.00029893918270099324 + }, + { + "step": 41, + "epoch": 0.27796610169491526, + "cpu_mem": 2.283757568, + "gpu_mem": 1.629899264, + "loss": 0.6992, + "grad_norm": 5.651178359985352, + "learning_rate": 0.00029871672920607153 + }, + { + "step": 42, + "epoch": 0.2847457627118644, + "cpu_mem": 2.283954176, + "gpu_mem": 1.629796352, + "loss": 0.5885, + "grad_norm": 3.0241758823394775, + "learning_rate": 0.0002984732162821399 + }, + { + "step": 43, + "epoch": 0.29152542372881357, + "cpu_mem": 2.284150784, + "gpu_mem": 1.629618176, + "loss": 0.6329, + "grad_norm": 10.584344863891602, + "learning_rate": 0.0002982086784124952 + }, + { + "step": 44, + "epoch": 0.2983050847457627, + "cpu_mem": 2.284347392, + "gpu_mem": 1.629761024, + "loss": 0.5715, + "grad_norm": 7.9485859870910645, + "learning_rate": 0.00029792315305772796 + }, + { + "step": 45, + "epoch": 0.3050847457627119, + "cpu_mem": 2.284347392, + "gpu_mem": 1.629541376, + "loss": 0.7837, + "grad_norm": 14.259190559387207, + "learning_rate": 0.0002976166806504174 + }, + { + "step": 46, + "epoch": 0.31186440677966104, + "cpu_mem": 2.284544, + "gpu_mem": 1.629784064, + "loss": 1.172, + "grad_norm": 33.988765716552734, + "learning_rate": 0.00029728930458940595 + }, + { + "step": 47, + "epoch": 0.31864406779661014, + "cpu_mem": 2.284740608, + "gpu_mem": 1.629507584, + "loss": 1.0339, + "grad_norm": 29.476423263549805, + "learning_rate": 0.00029694107123365385 + }, + { + "step": 48, + "epoch": 0.3254237288135593, + "cpu_mem": 2.284937216, + "gpu_mem": 1.629584384, + "loss": 0.672, + "grad_norm": 14.903922080993652, + "learning_rate": 0.00029657202989567393 + }, + { + "step": 49, + "epoch": 0.33220338983050846, + "cpu_mem": 2.285133824, + "gpu_mem": 1.62960128, + "loss": 1.0404, + "grad_norm": 21.83669662475586, + "learning_rate": 0.00029618223283454893 + }, + { + "step": 50, + "epoch": 0.3389830508474576, + "cpu_mem": 2.285133824, + "gpu_mem": 1.62953984, + "loss": 0.7913, + "grad_norm": 15.41524600982666, + "learning_rate": 0.00029577173524853123 + }, + { + "step": 51, + "epoch": 0.34576271186440677, + "cpu_mem": 2.285330432, + "gpu_mem": 1.629544448, + "loss": 0.5784, + "grad_norm": 5.735236644744873, + "learning_rate": 0.0002953405952672261 + }, + { + "step": 52, + "epoch": 0.3525423728813559, + "cpu_mem": 2.285330432, + "gpu_mem": 1.62962432, + "loss": 0.6792, + "grad_norm": 2.5827674865722656, + "learning_rate": 0.0002948888739433602 + }, + { + "step": 53, + "epoch": 0.3593220338983051, + "cpu_mem": 2.28552704, + "gpu_mem": 1.62964736, + "loss": 0.7903, + "grad_norm": 16.892005920410156, + "learning_rate": 0.0002944166352441363 + }, + { + "step": 54, + "epoch": 0.36610169491525424, + "cpu_mem": 2.28552704, + "gpu_mem": 1.629575168, + "loss": 0.6634, + "grad_norm": 3.408280849456787, + "learning_rate": 0.0002939239460421746 + }, + { + "step": 55, + "epoch": 0.3728813559322034, + "cpu_mem": 2.285723648, + "gpu_mem": 1.629845504, + "loss": 0.6152, + "grad_norm": 1.8816794157028198, + "learning_rate": 0.00029341087610604337 + }, + { + "step": 56, + "epoch": 0.37966101694915255, + "cpu_mem": 2.285723648, + "gpu_mem": 1.629632, + "loss": 0.8321, + "grad_norm": 12.979559898376465, + "learning_rate": 0.00029287749809037904 + }, + { + "step": 57, + "epoch": 0.3864406779661017, + "cpu_mem": 2.285920256, + "gpu_mem": 1.629625856, + "loss": 0.7125, + "grad_norm": 9.270895957946777, + "learning_rate": 0.0002923238875255979 + }, + { + "step": 58, + "epoch": 0.39322033898305087, + "cpu_mem": 2.285920256, + "gpu_mem": 1.629521408, + "loss": 0.6093, + "grad_norm": 5.737097263336182, + "learning_rate": 0.00029175012280720024 + }, + { + "step": 59, + "epoch": 0.4, + "cpu_mem": 2.286116864, + "gpu_mem": 1.629538304, + "loss": 0.6307, + "grad_norm": 1.514950156211853, + "learning_rate": 0.000291156285184669 + }, + { + "step": 60, + "epoch": 0.4067796610169492, + "cpu_mem": 2.286116864, + "gpu_mem": 1.629632, + "loss": 0.6646, + "grad_norm": 11.895840644836426, + "learning_rate": 0.00029054245874996426 + }, + { + "step": 61, + "epoch": 0.4135593220338983, + "cpu_mem": 2.286313472, + "gpu_mem": 1.629642752, + "loss": 0.6407, + "grad_norm": 9.51547622680664, + "learning_rate": 0.0002899087304256151 + }, + { + "step": 62, + "epoch": 0.42033898305084744, + "cpu_mem": 2.286313472, + "gpu_mem": 1.629630464, + "loss": 0.7282, + "grad_norm": 4.956035137176514, + "learning_rate": 0.0002892551899524109 + }, + { + "step": 63, + "epoch": 0.4271186440677966, + "cpu_mem": 2.28651008, + "gpu_mem": 1.629622784, + "loss": 0.4795, + "grad_norm": 3.0325164794921875, + "learning_rate": 0.000288581929876693 + }, + { + "step": 64, + "epoch": 0.43389830508474575, + "cpu_mem": 2.28651008, + "gpu_mem": 1.629552128, + "loss": 0.598, + "grad_norm": 7.368662357330322, + "learning_rate": 0.0002878890455372498 + }, + { + "step": 65, + "epoch": 0.4406779661016949, + "cpu_mem": 2.28651008, + "gpu_mem": 1.629596672, + "loss": 0.6301, + "grad_norm": 7.999270439147949, + "learning_rate": 0.0002871766350518159 + }, + { + "step": 66, + "epoch": 0.44745762711864406, + "cpu_mem": 2.286706688, + "gpu_mem": 1.629790208, + "loss": 0.5574, + "grad_norm": 4.22140645980835, + "learning_rate": 0.00028644479930317775 + }, + { + "step": 67, + "epoch": 0.4542372881355932, + "cpu_mem": 2.286706688, + "gpu_mem": 1.629499904, + "loss": 0.6395, + "grad_norm": 5.7975664138793945, + "learning_rate": 0.00028569364192488803 + }, + { + "step": 68, + "epoch": 0.4610169491525424, + "cpu_mem": 2.286706688, + "gpu_mem": 1.629467648, + "loss": 0.684, + "grad_norm": 6.35243034362793, + "learning_rate": 0.00028492326928659045 + }, + { + "step": 69, + "epoch": 0.46779661016949153, + "cpu_mem": 2.286903296, + "gpu_mem": 1.629533696, + "loss": 0.5577, + "grad_norm": 2.9018375873565674, + "learning_rate": 0.00028413379047895665 + }, + { + "step": 70, + "epoch": 0.4745762711864407, + "cpu_mem": 2.286903296, + "gpu_mem": 1.629527552, + "loss": 0.5382, + "grad_norm": 3.604681968688965, + "learning_rate": 0.0002833253172982385 + }, + { + "step": 71, + "epoch": 0.48135593220338985, + "cpu_mem": 2.286903296, + "gpu_mem": 1.629756416, + "loss": 0.556, + "grad_norm": 6.14028263092041, + "learning_rate": 0.0002824979642304366 + }, + { + "step": 72, + "epoch": 0.488135593220339, + "cpu_mem": 2.287099904, + "gpu_mem": 1.629748736, + "loss": 0.5006, + "grad_norm": 3.7308502197265625, + "learning_rate": 0.0002816518484350883 + }, + { + "step": 73, + "epoch": 0.49491525423728816, + "cpu_mem": 2.287099904, + "gpu_mem": 1.629714944, + "loss": 0.7734, + "grad_norm": 12.130180358886719, + "learning_rate": 0.0002807870897286772 + }, + { + "step": 74, + "epoch": 0.5016949152542373, + "cpu_mem": 2.287099904, + "gpu_mem": 1.629575168, + "loss": 0.4178, + "grad_norm": 3.5092108249664307, + "learning_rate": 0.0002799038105676658 + }, + { + "step": 75, + "epoch": 0.5084745762711864, + "cpu_mem": 2.287099904, + "gpu_mem": 1.629499904, + "loss": 0.6387, + "grad_norm": 9.334840774536133, + "learning_rate": 0.000279002136031155 + }, + { + "step": 76, + "epoch": 0.5152542372881356, + "cpu_mem": 2.287099904, + "gpu_mem": 1.62944, + "loss": 0.6527, + "grad_norm": 7.279111385345459, + "learning_rate": 0.00027808219380317216 + }, + { + "step": 77, + "epoch": 0.5220338983050847, + "cpu_mem": 2.287099904, + "gpu_mem": 1.629513728, + "loss": 0.4927, + "grad_norm": 5.388290882110596, + "learning_rate": 0.0002771441141545895 + }, + { + "step": 78, + "epoch": 0.5288135593220339, + "cpu_mem": 2.287296512, + "gpu_mem": 1.629565952, + "loss": 0.7974, + "grad_norm": 17.519184112548828, + "learning_rate": 0.0002761880299246772 + }, + { + "step": 79, + "epoch": 0.535593220338983, + "cpu_mem": 2.287296512, + "gpu_mem": 1.629698048, + "loss": 0.6155, + "grad_norm": 6.999842166900635, + "learning_rate": 0.000275214076502292 + }, + { + "step": 80, + "epoch": 0.5423728813559322, + "cpu_mem": 2.287296512, + "gpu_mem": 1.629588992, + "loss": 0.6331, + "grad_norm": 6.951205730438232, + "learning_rate": 0.0002742223918067056 + }, + { + "step": 81, + "epoch": 0.5491525423728814, + "cpu_mem": 2.287296512, + "gpu_mem": 1.629469184, + "loss": 0.6335, + "grad_norm": 7.675774097442627, + "learning_rate": 0.00027321311626807374 + }, + { + "step": 82, + "epoch": 0.5559322033898305, + "cpu_mem": 2.287296512, + "gpu_mem": 1.629538304, + "loss": 0.6085, + "grad_norm": 3.8413643836975098, + "learning_rate": 0.0002721863928075503 + }, + { + "step": 83, + "epoch": 0.5627118644067797, + "cpu_mem": 2.287296512, + "gpu_mem": 1.629638144, + "loss": 0.744, + "grad_norm": 8.29317855834961, + "learning_rate": 0.000271142366817049 + }, + { + "step": 84, + "epoch": 0.5694915254237288, + "cpu_mem": 2.28749312, + "gpu_mem": 1.62960128, + "loss": 0.5578, + "grad_norm": 3.062340021133423, + "learning_rate": 0.00027008118613865406 + }, + { + "step": 85, + "epoch": 0.576271186440678, + "cpu_mem": 2.28749312, + "gpu_mem": 1.629633536, + "loss": 0.5651, + "grad_norm": 2.2785956859588623, + "learning_rate": 0.00026900300104368524 + }, + { + "step": 86, + "epoch": 0.5830508474576271, + "cpu_mem": 2.28749312, + "gpu_mem": 1.629584384, + "loss": 0.5762, + "grad_norm": 3.3499417304992676, + "learning_rate": 0.00026790796421141813 + }, + { + "step": 87, + "epoch": 0.5898305084745763, + "cpu_mem": 2.28749312, + "gpu_mem": 1.629592064, + "loss": 0.4992, + "grad_norm": 2.441471576690674, + "learning_rate": 0.00026679623070746325 + }, + { + "step": 88, + "epoch": 0.5966101694915255, + "cpu_mem": 2.28749312, + "gpu_mem": 1.629736448, + "loss": 0.5305, + "grad_norm": 3.4041829109191895, + "learning_rate": 0.0002656679579618081 + }, + { + "step": 89, + "epoch": 0.6033898305084746, + "cpu_mem": 2.28749312, + "gpu_mem": 1.629518336, + "loss": 0.6027, + "grad_norm": 4.006837368011475, + "learning_rate": 0.0002645233057465235 + }, + { + "step": 90, + "epoch": 0.6101694915254238, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629572096, + "loss": 0.4848, + "grad_norm": 3.148393392562866, + "learning_rate": 0.00026336243615313873 + }, + { + "step": 91, + "epoch": 0.6169491525423729, + "cpu_mem": 2.287689728, + "gpu_mem": 1.62953984, + "loss": 0.424, + "grad_norm": 3.6978237628936768, + "learning_rate": 0.00026218551356968814 + }, + { + "step": 92, + "epoch": 0.6237288135593221, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629621248, + "loss": 0.7222, + "grad_norm": 11.239879608154297, + "learning_rate": 0.00026099270465743254 + }, + { + "step": 93, + "epoch": 0.6305084745762712, + "cpu_mem": 2.287689728, + "gpu_mem": 1.62942464, + "loss": 0.7732, + "grad_norm": 8.107810020446777, + "learning_rate": 0.0002597841783272588 + }, + { + "step": 94, + "epoch": 0.6372881355932203, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629538304, + "loss": 0.597, + "grad_norm": 9.186545372009277, + "learning_rate": 0.0002585601057157605 + }, + { + "step": 95, + "epoch": 0.6440677966101694, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629558272, + "loss": 0.599, + "grad_norm": 6.948132514953613, + "learning_rate": 0.00025732066016100394 + }, + { + "step": 96, + "epoch": 0.6508474576271186, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629596672, + "loss": 0.4721, + "grad_norm": 7.138347625732422, + "learning_rate": 0.00025606601717798207 + }, + { + "step": 97, + "epoch": 0.6576271186440678, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629581312, + "loss": 0.5511, + "grad_norm": 5.730069160461426, + "learning_rate": 0.0002547963544337602 + }, + { + "step": 98, + "epoch": 0.6644067796610169, + "cpu_mem": 2.287689728, + "gpu_mem": 1.62949376, + "loss": 0.6729, + "grad_norm": 9.908452033996582, + "learning_rate": 0.0002535118517223168 + }, + { + "step": 99, + "epoch": 0.6711864406779661, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629443072, + "loss": 0.6008, + "grad_norm": 8.284820556640625, + "learning_rate": 0.00025221269093908365 + }, + { + "step": 100, + "epoch": 0.6779661016949152, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629559808, + "loss": 0.5242, + "grad_norm": 6.290010929107666, + "learning_rate": 0.0002508990560551879 + }, + { + "step": 101, + "epoch": 0.6847457627118644, + "cpu_mem": 2.287689728, + "gpu_mem": 1.629592064, + "loss": 0.6053, + "grad_norm": 5.79994010925293, + "learning_rate": 0.0002495711330914001 + }, + { + "step": 102, + "epoch": 0.6915254237288135, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629625856, + "loss": 0.5162, + "grad_norm": 2.7990219593048096, + "learning_rate": 0.00024822911009179276 + }, + { + "step": 103, + "epoch": 0.6983050847457627, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629676544, + "loss": 0.6087, + "grad_norm": 4.584224700927734, + "learning_rate": 0.0002468731770971113 + }, + { + "step": 104, + "epoch": 0.7050847457627119, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629581312, + "loss": 0.5571, + "grad_norm": 4.992157459259033, + "learning_rate": 0.0002455035261178632 + }, + { + "step": 105, + "epoch": 0.711864406779661, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629682688, + "loss": 0.5607, + "grad_norm": 3.385103702545166, + "learning_rate": 0.0002441203511071278 + }, + { + "step": 106, + "epoch": 0.7186440677966102, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629633536, + "loss": 0.5572, + "grad_norm": 6.2194318771362305, + "learning_rate": 0.00024272384793309077 + }, + { + "step": 107, + "epoch": 0.7254237288135593, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629521408, + "loss": 0.5021, + "grad_norm": 5.199551105499268, + "learning_rate": 0.00024131421435130807 + }, + { + "step": 108, + "epoch": 0.7322033898305085, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629705728, + "loss": 0.53, + "grad_norm": 4.109259128570557, + "learning_rate": 0.00023989164997670202 + }, + { + "step": 109, + "epoch": 0.7389830508474576, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629559808, + "loss": 0.622, + "grad_norm": 5.802608966827393, + "learning_rate": 0.0002384563562552943 + }, + { + "step": 110, + "epoch": 0.7457627118644068, + "cpu_mem": 2.287886336, + "gpu_mem": 1.62956288, + "loss": 0.6068, + "grad_norm": 6.581479072570801, + "learning_rate": 0.0002370085364356797 + }, + { + "step": 111, + "epoch": 0.752542372881356, + "cpu_mem": 2.287886336, + "gpu_mem": 1.62953216, + "loss": 0.4929, + "grad_norm": 3.0374016761779785, + "learning_rate": 0.0002355483955402446 + }, + { + "step": 112, + "epoch": 0.7593220338983051, + "cpu_mem": 2.287886336, + "gpu_mem": 1.62957824, + "loss": 0.5096, + "grad_norm": 7.3958868980407715, + "learning_rate": 0.00023407614033613407 + }, + { + "step": 113, + "epoch": 0.7661016949152543, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629569024, + "loss": 0.524, + "grad_norm": 6.370842933654785, + "learning_rate": 0.0002325919793059723 + }, + { + "step": 114, + "epoch": 0.7728813559322034, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629550592, + "loss": 0.442, + "grad_norm": 4.379997730255127, + "learning_rate": 0.00023109612261833963 + }, + { + "step": 115, + "epoch": 0.7796610169491526, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629625856, + "loss": 0.5522, + "grad_norm": 7.073269844055176, + "learning_rate": 0.0002295887820980112 + }, + { + "step": 116, + "epoch": 0.7864406779661017, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629545984, + "loss": 0.4992, + "grad_norm": 4.717667579650879, + "learning_rate": 0.0002280701711959608 + }, + { + "step": 117, + "epoch": 0.7932203389830509, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629436928, + "loss": 0.5653, + "grad_norm": 8.18282413482666, + "learning_rate": 0.00022654050495913495 + }, + { + "step": 118, + "epoch": 0.8, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629675008, + "loss": 0.5615, + "grad_norm": 7.887237548828125, + "learning_rate": 0.000225 + }, + { + "step": 119, + "epoch": 0.8067796610169492, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629845504, + "loss": 0.4864, + "grad_norm": 5.4350199699401855, + "learning_rate": 0.00022344887446586865 + }, + { + "step": 120, + "epoch": 0.8135593220338984, + "cpu_mem": 2.287886336, + "gpu_mem": 1.62957824, + "loss": 0.5289, + "grad_norm": 8.603075981140137, + "learning_rate": 0.00022188734800800852 + }, + { + "step": 121, + "epoch": 0.8203389830508474, + "cpu_mem": 2.287886336, + "gpu_mem": 1.629605888, + "loss": 0.5197, + "grad_norm": 5.900209426879883, + "learning_rate": 0.00022031564175053754 + }, + { + "step": 122, + "epoch": 0.8271186440677966, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629656576, + "loss": 0.3942, + "grad_norm": 4.095558166503906, + "learning_rate": 0.00021873397825911153 + }, + { + "step": 123, + "epoch": 0.8338983050847457, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629466112, + "loss": 0.5044, + "grad_norm": 6.620598793029785, + "learning_rate": 0.00021714258150940685 + }, + { + "step": 124, + "epoch": 0.8406779661016949, + "cpu_mem": 2.288082944, + "gpu_mem": 1.62990848, + "loss": 0.431, + "grad_norm": 5.4499711990356445, + "learning_rate": 0.0002155416768554039 + }, + { + "step": 125, + "epoch": 0.847457627118644, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629635072, + "loss": 0.4359, + "grad_norm": 3.7434792518615723, + "learning_rate": 0.00021393149099747523 + }, + { + "step": 126, + "epoch": 0.8542372881355932, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629518336, + "loss": 0.4911, + "grad_norm": 6.276898384094238, + "learning_rate": 0.00021231225195028297 + }, + { + "step": 127, + "epoch": 0.8610169491525423, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629957632, + "loss": 0.5329, + "grad_norm": 4.533207416534424, + "learning_rate": 0.00021068418901049025 + }, + { + "step": 128, + "epoch": 0.8677966101694915, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629733376, + "loss": 0.5059, + "grad_norm": 6.234988212585449, + "learning_rate": 0.0002090475327242912 + }, + { + "step": 129, + "epoch": 0.8745762711864407, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629773312, + "loss": 0.4855, + "grad_norm": 6.016932010650635, + "learning_rate": 0.00020740251485476345 + }, + { + "step": 130, + "epoch": 0.8813559322033898, + "cpu_mem": 2.288082944, + "gpu_mem": 1.6295552, + "loss": 0.6791, + "grad_norm": 5.402276515960693, + "learning_rate": 0.0002057493683490491 + }, + { + "step": 131, + "epoch": 0.888135593220339, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629684224, + "loss": 0.4445, + "grad_norm": 6.3723063468933105, + "learning_rate": 0.00020408832730536746 + }, + { + "step": 132, + "epoch": 0.8949152542372881, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629765632, + "loss": 0.4106, + "grad_norm": 5.194598197937012, + "learning_rate": 0.00020241962693986476 + }, + { + "step": 133, + "epoch": 0.9016949152542373, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629549056, + "loss": 0.455, + "grad_norm": 4.656627178192139, + "learning_rate": 0.0002007435035533061 + }, + { + "step": 134, + "epoch": 0.9084745762711864, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629682688, + "loss": 0.4188, + "grad_norm": 6.00368070602417, + "learning_rate": 0.00019906019449761325 + }, + { + "step": 135, + "epoch": 0.9152542372881356, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629705728, + "loss": 0.4822, + "grad_norm": 6.199875831604004, + "learning_rate": 0.00019736993814225374 + }, + { + "step": 136, + "epoch": 0.9220338983050848, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629542912, + "loss": 0.5065, + "grad_norm": 6.572737693786621, + "learning_rate": 0.00019567297384048604 + }, + { + "step": 137, + "epoch": 0.9288135593220339, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629423104, + "loss": 0.5345, + "grad_norm": 6.6476731300354, + "learning_rate": 0.0001939695418954653 + }, + { + "step": 138, + "epoch": 0.9355932203389831, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629604352, + "loss": 0.4502, + "grad_norm": 6.0112385749816895, + "learning_rate": 0.00019225988352621445 + }, + { + "step": 139, + "epoch": 0.9423728813559322, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629502976, + "loss": 0.4314, + "grad_norm": 5.074925899505615, + "learning_rate": 0.00019054424083346592 + }, + { + "step": 140, + "epoch": 0.9491525423728814, + "cpu_mem": 2.288082944, + "gpu_mem": 1.6295552, + "loss": 0.5383, + "grad_norm": 5.428180694580078, + "learning_rate": 0.0001888228567653781 + }, + { + "step": 141, + "epoch": 0.9559322033898305, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629587456, + "loss": 0.6143, + "grad_norm": 9.621994018554688, + "learning_rate": 0.0001870959750831323 + }, + { + "step": 142, + "epoch": 0.9627118644067797, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629727232, + "loss": 0.4809, + "grad_norm": 7.100337505340576, + "learning_rate": 0.0001853638403264141 + }, + { + "step": 143, + "epoch": 0.9694915254237289, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629710336, + "loss": 0.5918, + "grad_norm": 4.86319637298584, + "learning_rate": 0.00018362669777878453 + }, + { + "step": 144, + "epoch": 0.976271186440678, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629902336, + "loss": 0.5188, + "grad_norm": 4.734114170074463, + "learning_rate": 0.00018188479343294648 + }, + { + "step": 145, + "epoch": 0.9830508474576272, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629613568, + "loss": 0.4657, + "grad_norm": 3.6268129348754883, + "learning_rate": 0.0001801383739559098 + }, + { + "step": 146, + "epoch": 0.9898305084745763, + "cpu_mem": 2.288082944, + "gpu_mem": 1.629648896, + "loss": 0.5017, + "grad_norm": 6.146897315979004, + "learning_rate": 0.0001783876866540615 + }, + { + "step": 147, + "epoch": 0.9966101694915255, + "cpu_mem": 2.288082944, + "gpu_mem": 1.62954752, + "loss": 0.5294, + "grad_norm": 4.615911483764648, + "learning_rate": 0.00017663297943814552 + }, + { + "step": 148, + "epoch": 1.0033898305084745, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650651648, + "loss": 0.5857, + "grad_norm": 6.189650058746338, + "learning_rate": 0.0001748745007881561 + }, + { + "step": 149, + "epoch": 1.0101694915254238, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650587136, + "loss": 0.337, + "grad_norm": 4.757652759552002, + "learning_rate": 0.00017311249971815185 + }, + { + "step": 150, + "epoch": 1.0169491525423728, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65042432, + "loss": 0.3725, + "grad_norm": 4.583643913269043, + "learning_rate": 0.00017134722574099276 + }, + { + "step": 151, + "epoch": 1.023728813559322, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650496512, + "loss": 0.3796, + "grad_norm": 3.325227975845337, + "learning_rate": 0.00016957892883300775 + }, + { + "step": 152, + "epoch": 1.0305084745762711, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65053184, + "loss": 0.3196, + "grad_norm": 3.292144775390625, + "learning_rate": 0.00016780785939859576 + }, + { + "step": 153, + "epoch": 1.0372881355932204, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650556416, + "loss": 0.4089, + "grad_norm": 4.5043559074401855, + "learning_rate": 0.00016603426823476693 + }, + { + "step": 154, + "epoch": 1.0440677966101695, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650518016, + "loss": 0.3459, + "grad_norm": 4.523366451263428, + "learning_rate": 0.00016425840649562736 + }, + { + "step": 155, + "epoch": 1.0508474576271187, + "cpu_mem": 2.288082944, + "gpu_mem": 1.6507392, + "loss": 0.4068, + "grad_norm": 4.620089054107666, + "learning_rate": 0.00016248052565681436 + }, + { + "step": 156, + "epoch": 1.0576271186440678, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65064704, + "loss": 0.3716, + "grad_norm": 4.992485523223877, + "learning_rate": 0.00016070087747988482 + }, + { + "step": 157, + "epoch": 1.064406779661017, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650553344, + "loss": 0.4066, + "grad_norm": 6.3387041091918945, + "learning_rate": 0.00015891971397666464 + }, + { + "step": 158, + "epoch": 1.071186440677966, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650479616, + "loss": 0.3726, + "grad_norm": 7.757758617401123, + "learning_rate": 0.00015713728737356137 + }, + { + "step": 159, + "epoch": 1.0779661016949154, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650828288, + "loss": 0.2632, + "grad_norm": 5.583423137664795, + "learning_rate": 0.00015535385007584706 + }, + { + "step": 160, + "epoch": 1.0847457627118644, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650422784, + "loss": 0.3927, + "grad_norm": 7.188099384307861, + "learning_rate": 0.0001535696546319161 + }, + { + "step": 161, + "epoch": 1.0915254237288137, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650369024, + "loss": 0.3415, + "grad_norm": 4.80125093460083, + "learning_rate": 0.00015178495369752213 + }, + { + "step": 162, + "epoch": 1.0983050847457627, + "cpu_mem": 2.288082944, + "gpu_mem": 1.651144704, + "loss": 0.3316, + "grad_norm": 4.747864246368408, + "learning_rate": 0.00015 + }, + { + "step": 163, + "epoch": 1.1050847457627118, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650620928, + "loss": 0.3439, + "grad_norm": 7.393890857696533, + "learning_rate": 0.00014821504630247785 + }, + { + "step": 164, + "epoch": 1.111864406779661, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650533376, + "loss": 0.4808, + "grad_norm": 6.472229957580566, + "learning_rate": 0.00014643034536808387 + }, + { + "step": 165, + "epoch": 1.11864406779661, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650482688, + "loss": 0.3088, + "grad_norm": 5.336971282958984, + "learning_rate": 0.00014464614992415294 + }, + { + "step": 166, + "epoch": 1.1254237288135593, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65057792, + "loss": 0.2696, + "grad_norm": 4.33073616027832, + "learning_rate": 0.00014286271262643866 + }, + { + "step": 167, + "epoch": 1.1322033898305084, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650494976, + "loss": 0.4324, + "grad_norm": 6.192999839782715, + "learning_rate": 0.00014108028602333536 + }, + { + "step": 168, + "epoch": 1.1389830508474577, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650513408, + "loss": 0.3803, + "grad_norm": 5.837098121643066, + "learning_rate": 0.00013929912252011516 + }, + { + "step": 169, + "epoch": 1.1457627118644067, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65060096, + "loss": 0.3484, + "grad_norm": 6.140503406524658, + "learning_rate": 0.00013751947434318564 + }, + { + "step": 170, + "epoch": 1.152542372881356, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65048576, + "loss": 0.4578, + "grad_norm": 8.3561429977417, + "learning_rate": 0.00013574159350437261 + }, + { + "step": 171, + "epoch": 1.159322033898305, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650548736, + "loss": 0.4711, + "grad_norm": 7.292174339294434, + "learning_rate": 0.0001339657317652331 + }, + { + "step": 172, + "epoch": 1.1661016949152543, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650456576, + "loss": 0.2886, + "grad_norm": 4.5055646896362305, + "learning_rate": 0.00013219214060140424 + }, + { + "step": 173, + "epoch": 1.1728813559322033, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650756096, + "loss": 0.448, + "grad_norm": 5.777190685272217, + "learning_rate": 0.00013042107116699228 + }, + { + "step": 174, + "epoch": 1.1796610169491526, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650479616, + "loss": 0.4072, + "grad_norm": 7.206395626068115, + "learning_rate": 0.00012865277425900724 + }, + { + "step": 175, + "epoch": 1.1864406779661016, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650445824, + "loss": 0.3169, + "grad_norm": 4.376151084899902, + "learning_rate": 0.00012688750028184818 + }, + { + "step": 176, + "epoch": 1.193220338983051, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650584064, + "loss": 0.2967, + "grad_norm": 4.89902400970459, + "learning_rate": 0.0001251254992118439 + }, + { + "step": 177, + "epoch": 1.2, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650682368, + "loss": 0.4047, + "grad_norm": 4.529948711395264, + "learning_rate": 0.00012336702056185453 + }, + { + "step": 178, + "epoch": 1.2067796610169492, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650428928, + "loss": 0.4695, + "grad_norm": 7.002455711364746, + "learning_rate": 0.00012161231334593851 + }, + { + "step": 179, + "epoch": 1.2135593220338983, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650528768, + "loss": 0.4556, + "grad_norm": 6.528550624847412, + "learning_rate": 0.00011986162604409015 + }, + { + "step": 180, + "epoch": 1.2203389830508475, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65050112, + "loss": 0.371, + "grad_norm": 7.862176418304443, + "learning_rate": 0.00011811520656705348 + }, + { + "step": 181, + "epoch": 1.2271186440677966, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650438144, + "loss": 0.2265, + "grad_norm": 3.887237071990967, + "learning_rate": 0.00011637330222121543 + }, + { + "step": 182, + "epoch": 1.2338983050847459, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650656256, + "loss": 0.4033, + "grad_norm": 6.422468185424805, + "learning_rate": 0.00011463615967358588 + }, + { + "step": 183, + "epoch": 1.240677966101695, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650553344, + "loss": 0.2994, + "grad_norm": 5.061545372009277, + "learning_rate": 0.00011290402491686766 + }, + { + "step": 184, + "epoch": 1.2474576271186442, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65050112, + "loss": 0.2947, + "grad_norm": 6.038346290588379, + "learning_rate": 0.00011117714323462186 + }, + { + "step": 185, + "epoch": 1.2542372881355932, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650479616, + "loss": 0.4024, + "grad_norm": 5.587726593017578, + "learning_rate": 0.00010945575916653407 + }, + { + "step": 186, + "epoch": 1.2610169491525425, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650488832, + "loss": 0.2681, + "grad_norm": 4.074586391448975, + "learning_rate": 0.00010774011647378553 + }, + { + "step": 187, + "epoch": 1.2677966101694915, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650421248, + "loss": 0.4247, + "grad_norm": 5.73874568939209, + "learning_rate": 0.00010603045810453468 + }, + { + "step": 188, + "epoch": 1.2745762711864406, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650584064, + "loss": 0.1952, + "grad_norm": 3.658073902130127, + "learning_rate": 0.00010432702615951396 + }, + { + "step": 189, + "epoch": 1.2813559322033898, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650453504, + "loss": 0.3894, + "grad_norm": 7.434362888336182, + "learning_rate": 0.00010263006185774627 + }, + { + "step": 190, + "epoch": 1.288135593220339, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650573312, + "loss": 0.407, + "grad_norm": 5.246573448181152, + "learning_rate": 0.00010093980550238675 + }, + { + "step": 191, + "epoch": 1.2949152542372881, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650392064, + "loss": 0.1933, + "grad_norm": 3.512233257293701, + "learning_rate": 9.925649644669391e-05 + }, + { + "step": 192, + "epoch": 1.3016949152542372, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65052416, + "loss": 0.2265, + "grad_norm": 5.8709492683410645, + "learning_rate": 9.758037306013526e-05 + }, + { + "step": 193, + "epoch": 1.3084745762711865, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650498048, + "loss": 0.3409, + "grad_norm": 5.3833136558532715, + "learning_rate": 9.591167269463255e-05 + }, + { + "step": 194, + "epoch": 1.3152542372881357, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650464256, + "loss": 0.3032, + "grad_norm": 7.840549468994141, + "learning_rate": 9.425063165095088e-05 + }, + { + "step": 195, + "epoch": 1.3220338983050848, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650568704, + "loss": 0.2203, + "grad_norm": 4.502388000488281, + "learning_rate": 9.259748514523653e-05 + }, + { + "step": 196, + "epoch": 1.3288135593220338, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650564096, + "loss": 0.3328, + "grad_norm": 5.698582649230957, + "learning_rate": 9.095246727570879e-05 + }, + { + "step": 197, + "epoch": 1.335593220338983, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650422784, + "loss": 0.3245, + "grad_norm": 6.016025543212891, + "learning_rate": 8.931581098950973e-05 + }, + { + "step": 198, + "epoch": 1.3423728813559321, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650614784, + "loss": 0.348, + "grad_norm": 7.089911937713623, + "learning_rate": 8.768774804971705e-05 + }, + { + "step": 199, + "epoch": 1.3491525423728814, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650465792, + "loss": 0.3733, + "grad_norm": 7.767142295837402, + "learning_rate": 8.606850900252478e-05 + }, + { + "step": 200, + "epoch": 1.3559322033898304, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650568704, + "loss": 0.2499, + "grad_norm": 6.6793928146362305, + "learning_rate": 8.445832314459608e-05 + }, + { + "step": 201, + "epoch": 1.3627118644067797, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650771456, + "loss": 0.286, + "grad_norm": 4.929233074188232, + "learning_rate": 8.285741849059311e-05 + }, + { + "step": 202, + "epoch": 1.3694915254237288, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650573312, + "loss": 0.2626, + "grad_norm": 5.682394027709961, + "learning_rate": 8.126602174088843e-05 + }, + { + "step": 203, + "epoch": 1.376271186440678, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650459648, + "loss": 0.2832, + "grad_norm": 4.6697587966918945, + "learning_rate": 7.968435824946242e-05 + }, + { + "step": 204, + "epoch": 1.383050847457627, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650473472, + "loss": 0.2435, + "grad_norm": 5.328696250915527, + "learning_rate": 7.811265199199152e-05 + }, + { + "step": 205, + "epoch": 1.3898305084745763, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650518016, + "loss": 0.2431, + "grad_norm": 6.064335823059082, + "learning_rate": 7.655112553413135e-05 + }, + { + "step": 206, + "epoch": 1.3966101694915254, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650459648, + "loss": 0.2835, + "grad_norm": 5.274746417999268, + "learning_rate": 7.500000000000002e-05 + }, + { + "step": 207, + "epoch": 1.4033898305084747, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65069312, + "loss": 0.3621, + "grad_norm": 7.475527763366699, + "learning_rate": 7.345949504086507e-05 + }, + { + "step": 208, + "epoch": 1.4101694915254237, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65072384, + "loss": 0.2446, + "grad_norm": 8.708454132080078, + "learning_rate": 7.192982880403917e-05 + }, + { + "step": 209, + "epoch": 1.4169491525423727, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650650112, + "loss": 0.309, + "grad_norm": 6.420502662658691, + "learning_rate": 7.041121790198881e-05 + }, + { + "step": 210, + "epoch": 1.423728813559322, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650537984, + "loss": 0.3699, + "grad_norm": 5.0680365562438965, + "learning_rate": 6.890387738166041e-05 + }, + { + "step": 211, + "epoch": 1.4305084745762713, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650487296, + "loss": 0.2628, + "grad_norm": 5.140324592590332, + "learning_rate": 6.740802069402771e-05 + }, + { + "step": 212, + "epoch": 1.4372881355932203, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650456576, + "loss": 0.2256, + "grad_norm": 5.826368808746338, + "learning_rate": 6.592385966386588e-05 + }, + { + "step": 213, + "epoch": 1.4440677966101694, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650479616, + "loss": 0.3156, + "grad_norm": 8.571372985839844, + "learning_rate": 6.445160445975536e-05 + }, + { + "step": 214, + "epoch": 1.4508474576271186, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65056256, + "loss": 0.3306, + "grad_norm": 6.758704662322998, + "learning_rate": 6.299146356432029e-05 + }, + { + "step": 215, + "epoch": 1.457627118644068, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650490368, + "loss": 0.4185, + "grad_norm": 10.27810287475586, + "learning_rate": 6.154364374470568e-05 + }, + { + "step": 216, + "epoch": 1.464406779661017, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650656256, + "loss": 0.2506, + "grad_norm": 7.038588523864746, + "learning_rate": 6.010835002329795e-05 + }, + { + "step": 217, + "epoch": 1.471186440677966, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650498048, + "loss": 0.3153, + "grad_norm": 10.312743186950684, + "learning_rate": 5.8685785648691894e-05 + }, + { + "step": 218, + "epoch": 1.4779661016949153, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650475008, + "loss": 0.3857, + "grad_norm": 7.422705173492432, + "learning_rate": 5.72761520669092e-05 + }, + { + "step": 219, + "epoch": 1.4847457627118645, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65060096, + "loss": 0.3799, + "grad_norm": 6.267871856689453, + "learning_rate": 5.587964889287218e-05 + }, + { + "step": 220, + "epoch": 1.4915254237288136, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650634752, + "loss": 0.2953, + "grad_norm": 5.6727447509765625, + "learning_rate": 5.449647388213678e-05 + }, + { + "step": 221, + "epoch": 1.4983050847457626, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650502656, + "loss": 0.3401, + "grad_norm": 6.367251396179199, + "learning_rate": 5.312682290288869e-05 + }, + { + "step": 222, + "epoch": 1.505084745762712, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65063936, + "loss": 0.3315, + "grad_norm": 6.6244306564331055, + "learning_rate": 5.1770889908207245e-05 + }, + { + "step": 223, + "epoch": 1.5118644067796612, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650553344, + "loss": 0.3877, + "grad_norm": 11.416462898254395, + "learning_rate": 5.0428866908599864e-05 + }, + { + "step": 224, + "epoch": 1.5186440677966102, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650518016, + "loss": 0.1837, + "grad_norm": 4.181188583374023, + "learning_rate": 4.9100943944812114e-05 + }, + { + "step": 225, + "epoch": 1.5254237288135593, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650482688, + "loss": 0.2229, + "grad_norm": 5.6809611320495605, + "learning_rate": 4.778730906091632e-05 + }, + { + "step": 226, + "epoch": 1.5322033898305085, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65063168, + "loss": 0.3181, + "grad_norm": 5.814105987548828, + "learning_rate": 4.648814827768322e-05 + }, + { + "step": 227, + "epoch": 1.5389830508474578, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650521088, + "loss": 0.2826, + "grad_norm": 5.96216344833374, + "learning_rate": 4.5203645566239816e-05 + }, + { + "step": 228, + "epoch": 1.5457627118644068, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650465792, + "loss": 0.3167, + "grad_norm": 6.1743998527526855, + "learning_rate": 4.3933982822017876e-05 + }, + { + "step": 229, + "epoch": 1.5525423728813559, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650407424, + "loss": 0.2082, + "grad_norm": 4.7159423828125, + "learning_rate": 4.267933983899601e-05 + }, + { + "step": 230, + "epoch": 1.559322033898305, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650464256, + "loss": 0.1898, + "grad_norm": 5.020226001739502, + "learning_rate": 4.143989428423947e-05 + }, + { + "step": 231, + "epoch": 1.5661016949152542, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650742272, + "loss": 0.3824, + "grad_norm": 8.188497543334961, + "learning_rate": 4.0215821672741213e-05 + }, + { + "step": 232, + "epoch": 1.5728813559322035, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650465792, + "loss": 0.2713, + "grad_norm": 6.657375812530518, + "learning_rate": 3.900729534256745e-05 + }, + { + "step": 233, + "epoch": 1.5796610169491525, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650779136, + "loss": 0.2742, + "grad_norm": 6.72658634185791, + "learning_rate": 3.781448643031187e-05 + }, + { + "step": 234, + "epoch": 1.5864406779661016, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65065472, + "loss": 0.2236, + "grad_norm": 4.911664962768555, + "learning_rate": 3.663756384686127e-05 + }, + { + "step": 235, + "epoch": 1.5932203389830508, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650410496, + "loss": 0.2251, + "grad_norm": 4.894618034362793, + "learning_rate": 3.547669425347647e-05 + }, + { + "step": 236, + "epoch": 1.6, + "cpu_mem": 2.288082944, + "gpu_mem": 1.6504704, + "loss": 0.2851, + "grad_norm": 5.592400550842285, + "learning_rate": 3.433204203819185e-05 + }, + { + "step": 237, + "epoch": 1.6067796610169491, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65053184, + "loss": 0.3592, + "grad_norm": 7.877523422241211, + "learning_rate": 3.3203769292536764e-05 + }, + { + "step": 238, + "epoch": 1.6135593220338982, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650533376, + "loss": 0.3357, + "grad_norm": 6.169644355773926, + "learning_rate": 3.209203578858191e-05 + }, + { + "step": 239, + "epoch": 1.6203389830508474, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650786816, + "loss": 0.3318, + "grad_norm": 5.763091564178467, + "learning_rate": 3.099699895631474e-05 + }, + { + "step": 240, + "epoch": 1.6271186440677967, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650436608, + "loss": 0.5273, + "grad_norm": 11.911809921264648, + "learning_rate": 2.9918813861345952e-05 + }, + { + "step": 241, + "epoch": 1.6338983050847458, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650733056, + "loss": 0.3822, + "grad_norm": 8.010395050048828, + "learning_rate": 2.885763318295102e-05 + }, + { + "step": 242, + "epoch": 1.6406779661016948, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650594816, + "loss": 0.232, + "grad_norm": 4.8402838706970215, + "learning_rate": 2.781360719244964e-05 + }, + { + "step": 243, + "epoch": 1.647457627118644, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65044736, + "loss": 0.3186, + "grad_norm": 7.148870944976807, + "learning_rate": 2.6786883731926306e-05 + }, + { + "step": 244, + "epoch": 1.6542372881355933, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650587136, + "loss": 0.1882, + "grad_norm": 5.340954303741455, + "learning_rate": 2.5777608193294396e-05 + }, + { + "step": 245, + "epoch": 1.6610169491525424, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650465792, + "loss": 0.3445, + "grad_norm": 6.289117813110352, + "learning_rate": 2.4785923497707956e-05 + }, + { + "step": 246, + "epoch": 1.6677966101694914, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650559488, + "loss": 0.3711, + "grad_norm": 5.0606369972229, + "learning_rate": 2.38119700753228e-05 + }, + { + "step": 247, + "epoch": 1.6745762711864407, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65057792, + "loss": 0.1621, + "grad_norm": 4.27090311050415, + "learning_rate": 2.285588584541047e-05 + }, + { + "step": 248, + "epoch": 1.68135593220339, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650530304, + "loss": 0.3046, + "grad_norm": 4.624502182006836, + "learning_rate": 2.1917806196827792e-05 + }, + { + "step": 249, + "epoch": 1.688135593220339, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650436608, + "loss": 0.2048, + "grad_norm": 3.9607715606689453, + "learning_rate": 2.0997863968844914e-05 + }, + { + "step": 250, + "epoch": 1.694915254237288, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650528768, + "loss": 0.31, + "grad_norm": 6.483154296875, + "learning_rate": 2.009618943233419e-05 + }, + { + "step": 251, + "epoch": 1.7016949152542373, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650441216, + "loss": 0.2956, + "grad_norm": 8.095823287963867, + "learning_rate": 1.921291027132278e-05 + }, + { + "step": 252, + "epoch": 1.7084745762711866, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650484224, + "loss": 0.3078, + "grad_norm": 5.848687648773193, + "learning_rate": 1.834815156491165e-05 + }, + { + "step": 253, + "epoch": 1.7152542372881356, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65067776, + "loss": 0.421, + "grad_norm": 9.197225570678711, + "learning_rate": 1.750203576956341e-05 + }, + { + "step": 254, + "epoch": 1.7220338983050847, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650473472, + "loss": 0.4033, + "grad_norm": 6.230844974517822, + "learning_rate": 1.6674682701761493e-05 + }, + { + "step": 255, + "epoch": 1.7288135593220337, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650630144, + "loss": 0.3933, + "grad_norm": 7.9306793212890625, + "learning_rate": 1.5866209521043304e-05 + }, + { + "step": 256, + "epoch": 1.735593220338983, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650456576, + "loss": 0.2965, + "grad_norm": 6.549820899963379, + "learning_rate": 1.5076730713409523e-05 + }, + { + "step": 257, + "epoch": 1.7423728813559323, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65086976, + "loss": 0.3846, + "grad_norm": 4.548492908477783, + "learning_rate": 1.4306358075111923e-05 + }, + { + "step": 258, + "epoch": 1.7491525423728813, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650528768, + "loss": 0.3748, + "grad_norm": 9.147762298583984, + "learning_rate": 1.3555200696822232e-05 + }, + { + "step": 259, + "epoch": 1.7559322033898304, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650445824, + "loss": 0.3714, + "grad_norm": 5.773159980773926, + "learning_rate": 1.2823364948184095e-05 + }, + { + "step": 260, + "epoch": 1.7627118644067796, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65056256, + "loss": 0.1873, + "grad_norm": 4.348002910614014, + "learning_rate": 1.2110954462750166e-05 + }, + { + "step": 261, + "epoch": 1.769491525423729, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650518016, + "loss": 0.1657, + "grad_norm": 5.594480991363525, + "learning_rate": 1.1418070123306989e-05 + }, + { + "step": 262, + "epoch": 1.776271186440678, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650475008, + "loss": 0.2255, + "grad_norm": 6.176375865936279, + "learning_rate": 1.0744810047589115e-05 + }, + { + "step": 263, + "epoch": 1.783050847457627, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650511872, + "loss": 0.2636, + "grad_norm": 3.879056215286255, + "learning_rate": 1.0091269574384874e-05 + }, + { + "step": 264, + "epoch": 1.7898305084745763, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650599424, + "loss": 0.2695, + "grad_norm": 5.05800199508667, + "learning_rate": 9.45754125003576e-06 + }, + { + "step": 265, + "epoch": 1.7966101694915255, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650518016, + "loss": 0.3391, + "grad_norm": 6.6437907218933105, + "learning_rate": 8.843714815330987e-06 + }, + { + "step": 266, + "epoch": 1.8033898305084746, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650733056, + "loss": 0.468, + "grad_norm": 6.713739395141602, + "learning_rate": 8.249877192799731e-06 + }, + { + "step": 267, + "epoch": 1.8101694915254236, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650525696, + "loss": 0.4037, + "grad_norm": 5.255074501037598, + "learning_rate": 7.676112474402068e-06 + }, + { + "step": 268, + "epoch": 1.8169491525423729, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650530304, + "loss": 0.2648, + "grad_norm": 5.353787422180176, + "learning_rate": 7.122501909620926e-06 + }, + { + "step": 269, + "epoch": 1.8237288135593221, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650541056, + "loss": 0.3807, + "grad_norm": 6.517563343048096, + "learning_rate": 6.5891238939566275e-06 + }, + { + "step": 270, + "epoch": 1.8305084745762712, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650579456, + "loss": 0.2661, + "grad_norm": 5.572103023529053, + "learning_rate": 6.076053957825411e-06 + }, + { + "step": 271, + "epoch": 1.8372881355932202, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65063168, + "loss": 0.2527, + "grad_norm": 6.07205867767334, + "learning_rate": 5.583364755863701e-06 + }, + { + "step": 272, + "epoch": 1.8440677966101695, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650490368, + "loss": 0.3436, + "grad_norm": 6.657040119171143, + "learning_rate": 5.11112605663977e-06 + }, + { + "step": 273, + "epoch": 1.8508474576271188, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65037056, + "loss": 0.372, + "grad_norm": 5.9023332595825195, + "learning_rate": 4.659404732773908e-06 + }, + { + "step": 274, + "epoch": 1.8576271186440678, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650597888, + "loss": 0.2652, + "grad_norm": 5.236993789672852, + "learning_rate": 4.228264751468752e-06 + }, + { + "step": 275, + "epoch": 1.8644067796610169, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650842112, + "loss": 0.2475, + "grad_norm": 5.636241912841797, + "learning_rate": 3.817767165451041e-06 + }, + { + "step": 276, + "epoch": 1.8711864406779661, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650502656, + "loss": 0.246, + "grad_norm": 4.479111671447754, + "learning_rate": 3.4279701043260886e-06 + }, + { + "step": 277, + "epoch": 1.8779661016949154, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650448896, + "loss": 0.3801, + "grad_norm": 5.057372093200684, + "learning_rate": 3.0589287663461472e-06 + }, + { + "step": 278, + "epoch": 1.8847457627118644, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650611712, + "loss": 0.3574, + "grad_norm": 6.29480504989624, + "learning_rate": 2.710695410593994e-06 + }, + { + "step": 279, + "epoch": 1.8915254237288135, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650551808, + "loss": 0.3276, + "grad_norm": 5.134455680847168, + "learning_rate": 2.3833193495825853e-06 + }, + { + "step": 280, + "epoch": 1.8983050847457628, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65053184, + "loss": 0.195, + "grad_norm": 4.122469902038574, + "learning_rate": 2.076846942272026e-06 + }, + { + "step": 281, + "epoch": 1.905084745762712, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650467328, + "loss": 0.3247, + "grad_norm": 4.672211170196533, + "learning_rate": 1.791321587504768e-06 + }, + { + "step": 282, + "epoch": 1.911864406779661, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650895872, + "loss": 0.2197, + "grad_norm": 5.256533622741699, + "learning_rate": 1.5267837178600972e-06 + }, + { + "step": 283, + "epoch": 1.9186440677966101, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650602496, + "loss": 0.3066, + "grad_norm": 4.685293674468994, + "learning_rate": 1.2832707939284427e-06 + }, + { + "step": 284, + "epoch": 1.9254237288135592, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650458112, + "loss": 0.3476, + "grad_norm": 4.5423760414123535, + "learning_rate": 1.0608172990067553e-06 + }, + { + "step": 285, + "epoch": 1.9322033898305084, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650511872, + "loss": 0.262, + "grad_norm": 4.820805549621582, + "learning_rate": 8.594547342153979e-07 + }, + { + "step": 286, + "epoch": 1.9389830508474577, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650929664, + "loss": 0.2149, + "grad_norm": 5.467118263244629, + "learning_rate": 6.792116140373116e-07 + }, + { + "step": 287, + "epoch": 1.9457627118644067, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650699264, + "loss": 0.2304, + "grad_norm": 4.803750038146973, + "learning_rate": 5.201134622801473e-07 + }, + { + "step": 288, + "epoch": 1.9525423728813558, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650484224, + "loss": 0.4071, + "grad_norm": 6.370242118835449, + "learning_rate": 3.821828084619727e-07 + }, + { + "step": 289, + "epoch": 1.959322033898305, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650568704, + "loss": 0.2141, + "grad_norm": 5.2170820236206055, + "learning_rate": 2.654391846207915e-07 + }, + { + "step": 290, + "epoch": 1.9661016949152543, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65049344, + "loss": 0.3001, + "grad_norm": 6.073480606079102, + "learning_rate": 1.6989912254880556e-07 + }, + { + "step": 291, + "epoch": 1.9728813559322034, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650528768, + "loss": 0.3566, + "grad_norm": 8.38736629486084, + "learning_rate": 9.557615145123765e-08 + }, + { + "step": 292, + "epoch": 1.9796610169491524, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650611712, + "loss": 0.2794, + "grad_norm": 5.543145656585693, + "learning_rate": 4.248079603064724e-08 + }, + { + "step": 293, + "epoch": 1.9864406779661017, + "cpu_mem": 2.288082944, + "gpu_mem": 1.650528768, + "loss": 0.4191, + "grad_norm": 7.412874221801758, + "learning_rate": 1.0620574996372811e-08 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65055488, + "loss": 0.2479, + "grad_norm": 5.484694480895996, + "learning_rate": 0.0 + }, + { + "step": 294, + "epoch": 1.993220338983051, + "cpu_mem": 2.288082944, + "gpu_mem": 1.65055488, + "train_runtime": 4555.895, + "train_samples_per_second": 4.138, + "train_steps_per_second": 0.065, + "total_flos": 0.0, + "train_loss": 0.5994277026783041 + } +] \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/adapter_config.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b45b0ec05dc0b2725649e69090539b84b3e445ea --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha": 64, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1", + "bias": "none", + "enabled_mlp": true, + "enabled_qkv": [ + "q", + "k", + "v" + ], + "fan_in_fan_out": false, + "inference_mode": false, + "layers_pattern": null, + "layers_to_transform": null, + "mixture": false, + "modules_to_preserve_errors": null, + "modules_to_quantize": null, + "modules_to_save": null, + "onnx_export": false, + "optimization_level": 3, + "orthogonal_init": false, + "peft_type": "MARS", + "quant_n_bits": 8, + "r": 32, + "revision": null, + "seed": 42, + "shared_r": 32, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": null, + "use_bnb": false +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/eval_results.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a7fef87a6621a6390b3084ec7fb570631432890c --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/eval_results.json @@ -0,0 +1,4 @@ +{ + "task": "logiqa", + "results": 0.44556909729394756 +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/training_configuration.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/training_configuration.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe0b67db0a46af55ea27358636de5a7fe8c3c5e --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/training_configuration.json @@ -0,0 +1,38 @@ +{ + "model_id": "TinyLlama/TinyLlama_v1.1", + "dataset": { + "name": "LOGIQA", + "dataset_id": "data/logiqa_train", + "preprocess_id": "logiqa_train_deepeval" + }, + "peft_config": { + "method": "mars", + "rank": 32, + "alpha": 64, + "dropout": 0.0, + "bias": "none", + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "trainable_parameter_count": 21018624 + }, + "training_config": { + "max_dataset_length": null, + "batch_size": 64, + "per_device_batch_size": 32, + "gradient_accumulation_steps": 2, + "learning_rate": 0.0003, + "num_epochs": 3, + "warmup_ratio": 0.1 + }, + "model_name": "TinyLlama_v1.1-mars-logiqa-r32-a2", + "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-logiqa-r32-a2", + "seed": 42, + "timestamp": "2025-09-02T12:52:00.563090" +} \ No newline at end of file diff --git a/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/training_logs.json b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/training_logs.json new file mode 100644 index 0000000000000000000000000000000000000000..30d6e308072408dafb6a29a87cc77bb1faa9cd2e --- /dev/null +++ b/TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r32-a2/training_logs.json @@ -0,0 +1,5305 @@ +[ + { + "step": 1, + "epoch": 0.005089058524173028, + "cpu_mem": 3.111108608, + "gpu_mem": 1.652903424, + "loss": 3.8704, + "grad_norm": 77.43319702148438, + "learning_rate": 5.084745762711864e-06 + }, + { + "step": 2, + "epoch": 0.010178117048346057, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821071872, + "loss": 3.9889, + "grad_norm": 77.22821807861328, + "learning_rate": 1.0169491525423728e-05 + }, + { + "step": 3, + "epoch": 0.015267175572519083, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821148672, + "loss": 3.4625, + "grad_norm": 66.85151672363281, + "learning_rate": 1.5254237288135592e-05 + }, + { + "step": 4, + "epoch": 0.020356234096692113, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821047296, + "loss": 2.7331, + "grad_norm": 48.447811126708984, + "learning_rate": 2.0338983050847455e-05 + }, + { + "step": 5, + "epoch": 0.02544529262086514, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821062656, + "loss": 2.2856, + "grad_norm": 27.1071720123291, + "learning_rate": 2.542372881355932e-05 + }, + { + "step": 6, + "epoch": 0.030534351145038167, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821054976, + "loss": 1.938, + "grad_norm": 15.736248970031738, + "learning_rate": 3.0508474576271185e-05 + }, + { + "step": 7, + "epoch": 0.035623409669211195, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821125632, + "loss": 1.5896, + "grad_norm": 9.598876953125, + "learning_rate": 3.559322033898305e-05 + }, + { + "step": 8, + "epoch": 0.04071246819338423, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821110272, + "loss": 1.521, + "grad_norm": 5.87281608581543, + "learning_rate": 4.067796610169491e-05 + }, + { + "step": 9, + "epoch": 0.04580152671755725, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821108736, + "loss": 1.5015, + "grad_norm": 6.580787181854248, + "learning_rate": 4.576271186440678e-05 + }, + { + "step": 10, + "epoch": 0.05089058524173028, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821119488, + "loss": 1.401, + "grad_norm": 3.961298942565918, + "learning_rate": 5.084745762711864e-05 + }, + { + "step": 11, + "epoch": 0.05597964376590331, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82102272, + "loss": 1.4032, + "grad_norm": 3.8365039825439453, + "learning_rate": 5.59322033898305e-05 + }, + { + "step": 12, + "epoch": 0.061068702290076333, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821073408, + "loss": 1.4189, + "grad_norm": 4.514285564422607, + "learning_rate": 6.101694915254237e-05 + }, + { + "step": 13, + "epoch": 0.06615776081424936, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821165568, + "loss": 1.3857, + "grad_norm": 2.3672590255737305, + "learning_rate": 6.610169491525423e-05 + }, + { + "step": 14, + "epoch": 0.07124681933842239, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821078016, + "loss": 1.471, + "grad_norm": 5.247903823852539, + "learning_rate": 7.11864406779661e-05 + }, + { + "step": 15, + "epoch": 0.07633587786259542, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821216256, + "loss": 1.4185, + "grad_norm": 2.809873580932617, + "learning_rate": 7.627118644067796e-05 + }, + { + "step": 16, + "epoch": 0.08142493638676845, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82107648, + "loss": 1.4816, + "grad_norm": 5.729938983917236, + "learning_rate": 8.135593220338982e-05 + }, + { + "step": 17, + "epoch": 0.08651399491094147, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8211072, + "loss": 1.3951, + "grad_norm": 3.3262863159179688, + "learning_rate": 8.64406779661017e-05 + }, + { + "step": 18, + "epoch": 0.0916030534351145, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821070336, + "loss": 1.4086, + "grad_norm": 4.384488582611084, + "learning_rate": 9.152542372881355e-05 + }, + { + "step": 19, + "epoch": 0.09669211195928754, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820978176, + "loss": 1.3778, + "grad_norm": 1.2025991678237915, + "learning_rate": 9.661016949152541e-05 + }, + { + "step": 20, + "epoch": 0.10178117048346055, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821016576, + "loss": 1.3709, + "grad_norm": 1.226719856262207, + "learning_rate": 0.00010169491525423727 + }, + { + "step": 21, + "epoch": 0.10687022900763359, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821150208, + "loss": 1.4556, + "grad_norm": 5.13141393661499, + "learning_rate": 0.00010677966101694915 + }, + { + "step": 22, + "epoch": 0.11195928753180662, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821048832, + "loss": 1.433, + "grad_norm": 3.8367297649383545, + "learning_rate": 0.000111864406779661 + }, + { + "step": 23, + "epoch": 0.11704834605597965, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82107648, + "loss": 1.4417, + "grad_norm": 4.432380199432373, + "learning_rate": 0.00011694915254237288 + }, + { + "step": 24, + "epoch": 0.12213740458015267, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821070336, + "loss": 1.3887, + "grad_norm": 2.2985289096832275, + "learning_rate": 0.00012203389830508474 + }, + { + "step": 25, + "epoch": 0.1272264631043257, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82107648, + "loss": 1.4507, + "grad_norm": 3.923163890838623, + "learning_rate": 0.00012711864406779658 + }, + { + "step": 26, + "epoch": 0.13231552162849872, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821128704, + "loss": 1.4328, + "grad_norm": 4.150632858276367, + "learning_rate": 0.00013220338983050846 + }, + { + "step": 27, + "epoch": 0.13740458015267176, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821070336, + "loss": 1.4214, + "grad_norm": 3.012176036834717, + "learning_rate": 0.00013728813559322033 + }, + { + "step": 28, + "epoch": 0.14249363867684478, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821016576, + "loss": 1.4158, + "grad_norm": 1.8457833528518677, + "learning_rate": 0.0001423728813559322 + }, + { + "step": 29, + "epoch": 0.1475826972010178, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821108736, + "loss": 1.3806, + "grad_norm": 1.184678077697754, + "learning_rate": 0.00014745762711864405 + }, + { + "step": 30, + "epoch": 0.15267175572519084, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821104128, + "loss": 1.324, + "grad_norm": 2.047057628631592, + "learning_rate": 0.00015254237288135592 + }, + { + "step": 31, + "epoch": 0.15776081424936386, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821082624, + "loss": 1.369, + "grad_norm": 2.089134931564331, + "learning_rate": 0.0001576271186440678 + }, + { + "step": 32, + "epoch": 0.1628498727735369, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821087232, + "loss": 1.6046, + "grad_norm": 6.350461006164551, + "learning_rate": 0.00016271186440677964 + }, + { + "step": 33, + "epoch": 0.16793893129770993, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82112256, + "loss": 1.3662, + "grad_norm": 2.530993938446045, + "learning_rate": 0.0001677966101694915 + }, + { + "step": 34, + "epoch": 0.17302798982188294, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821047296, + "loss": 1.4543, + "grad_norm": 3.0746216773986816, + "learning_rate": 0.0001728813559322034 + }, + { + "step": 35, + "epoch": 0.178117048346056, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821094912, + "loss": 1.373, + "grad_norm": 1.8788795471191406, + "learning_rate": 0.00017796610169491523 + }, + { + "step": 36, + "epoch": 0.183206106870229, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821110272, + "loss": 1.5255, + "grad_norm": 4.8368401527404785, + "learning_rate": 0.0001830508474576271 + }, + { + "step": 37, + "epoch": 0.18829516539440203, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82117632, + "loss": 1.5946, + "grad_norm": 5.974050998687744, + "learning_rate": 0.00018813559322033895 + }, + { + "step": 38, + "epoch": 0.19338422391857507, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821059584, + "loss": 1.3837, + "grad_norm": 0.9142059683799744, + "learning_rate": 0.00019322033898305083 + }, + { + "step": 39, + "epoch": 0.1984732824427481, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82117632, + "loss": 1.4891, + "grad_norm": 2.3137128353118896, + "learning_rate": 0.0001983050847457627 + }, + { + "step": 40, + "epoch": 0.2035623409669211, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821097984, + "loss": 1.3519, + "grad_norm": 0.6163047552108765, + "learning_rate": 0.00020338983050847455 + }, + { + "step": 41, + "epoch": 0.20865139949109415, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820996608, + "loss": 1.4103, + "grad_norm": 1.914015531539917, + "learning_rate": 0.00020847457627118642 + }, + { + "step": 42, + "epoch": 0.21374045801526717, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8210688, + "loss": 1.4675, + "grad_norm": 2.6388909816741943, + "learning_rate": 0.0002135593220338983 + }, + { + "step": 43, + "epoch": 0.21882951653944022, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821031936, + "loss": 1.41, + "grad_norm": 1.9415768384933472, + "learning_rate": 0.00021864406779661014 + }, + { + "step": 44, + "epoch": 0.22391857506361323, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821067264, + "loss": 1.4645, + "grad_norm": 2.140970230102539, + "learning_rate": 0.000223728813559322 + }, + { + "step": 45, + "epoch": 0.22900763358778625, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821121024, + "loss": 1.4556, + "grad_norm": 1.9815139770507812, + "learning_rate": 0.00022881355932203386 + }, + { + "step": 46, + "epoch": 0.2340966921119593, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82116864, + "loss": 1.421, + "grad_norm": 1.6856343746185303, + "learning_rate": 0.00023389830508474576 + }, + { + "step": 47, + "epoch": 0.23918575063613232, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820996608, + "loss": 1.4289, + "grad_norm": 2.08207106590271, + "learning_rate": 0.0002389830508474576 + }, + { + "step": 48, + "epoch": 0.24427480916030533, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821050368, + "loss": 1.403, + "grad_norm": 1.014788269996643, + "learning_rate": 0.00024406779661016948 + }, + { + "step": 49, + "epoch": 0.24936386768447838, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821039616, + "loss": 1.4549, + "grad_norm": 2.0267350673675537, + "learning_rate": 0.00024915254237288135 + }, + { + "step": 50, + "epoch": 0.2544529262086514, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821048832, + "loss": 1.3902, + "grad_norm": 0.978413462638855, + "learning_rate": 0.00025423728813559317 + }, + { + "step": 51, + "epoch": 0.2595419847328244, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821144064, + "loss": 1.3769, + "grad_norm": 1.0182136297225952, + "learning_rate": 0.0002593220338983051 + }, + { + "step": 52, + "epoch": 0.26463104325699743, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821085696, + "loss": 1.4931, + "grad_norm": 2.9022347927093506, + "learning_rate": 0.0002644067796610169 + }, + { + "step": 53, + "epoch": 0.2697201017811705, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821164032, + "loss": 1.4766, + "grad_norm": 2.8224120140075684, + "learning_rate": 0.0002694915254237288 + }, + { + "step": 54, + "epoch": 0.2748091603053435, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821064192, + "loss": 1.4009, + "grad_norm": 1.2312403917312622, + "learning_rate": 0.00027457627118644066 + }, + { + "step": 55, + "epoch": 0.27989821882951654, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82106112, + "loss": 1.4579, + "grad_norm": 2.528264284133911, + "learning_rate": 0.0002796610169491525 + }, + { + "step": 56, + "epoch": 0.28498727735368956, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821105664, + "loss": 1.4377, + "grad_norm": 1.8990558385849, + "learning_rate": 0.0002847457627118644 + }, + { + "step": 57, + "epoch": 0.2900763358778626, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821097984, + "loss": 1.4124, + "grad_norm": 1.682401418685913, + "learning_rate": 0.00028983050847457623 + }, + { + "step": 58, + "epoch": 0.2951653944020356, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821121024, + "loss": 1.3908, + "grad_norm": 1.022024154663086, + "learning_rate": 0.0002949152542372881 + }, + { + "step": 59, + "epoch": 0.30025445292620867, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821088768, + "loss": 1.4857, + "grad_norm": 3.3230535984039307, + "learning_rate": 0.0003 + }, + { + "step": 60, + "epoch": 0.3053435114503817, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821079552, + "loss": 1.402, + "grad_norm": 1.06948983669281, + "learning_rate": 0.00029999735486167307 + }, + { + "step": 61, + "epoch": 0.3104325699745547, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821116416, + "loss": 1.4069, + "grad_norm": 1.742200255393982, + "learning_rate": 0.00029998941953998247 + }, + { + "step": 62, + "epoch": 0.3155216284987277, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821156352, + "loss": 1.382, + "grad_norm": 1.1098747253417969, + "learning_rate": 0.0002999761943147951 + }, + { + "step": 63, + "epoch": 0.32061068702290074, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821085696, + "loss": 1.3934, + "grad_norm": 1.1350864171981812, + "learning_rate": 0.000299957679652545 + }, + { + "step": 64, + "epoch": 0.3256997455470738, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820987392, + "loss": 1.4064, + "grad_norm": 1.8642208576202393, + "learning_rate": 0.0002999338762062168 + }, + { + "step": 65, + "epoch": 0.33078880407124683, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8210688, + "loss": 1.4376, + "grad_norm": 2.2374446392059326, + "learning_rate": 0.00029990478481532246 + }, + { + "step": 66, + "epoch": 0.33587786259541985, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821173248, + "loss": 1.3994, + "grad_norm": 0.6631535291671753, + "learning_rate": 0.00029987040650587214 + }, + { + "step": 67, + "epoch": 0.34096692111959287, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821050368, + "loss": 1.4136, + "grad_norm": 1.2549136877059937, + "learning_rate": 0.0002998307424903376 + }, + { + "step": 68, + "epoch": 0.3460559796437659, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821102592, + "loss": 1.4228, + "grad_norm": 1.1801550388336182, + "learning_rate": 0.00029978579416760955 + }, + { + "step": 69, + "epoch": 0.3511450381679389, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821094912, + "loss": 1.3921, + "grad_norm": 1.004292607307434, + "learning_rate": 0.00029973556312294853 + }, + { + "step": 70, + "epoch": 0.356234096692112, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821010432, + "loss": 1.4271, + "grad_norm": 1.4426991939544678, + "learning_rate": 0.0002996800511279286 + }, + { + "step": 71, + "epoch": 0.361323155216285, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821033472, + "loss": 1.4191, + "grad_norm": 1.7635105848312378, + "learning_rate": 0.0002996192601403751 + }, + { + "step": 72, + "epoch": 0.366412213740458, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821082624, + "loss": 1.3706, + "grad_norm": 0.528340756893158, + "learning_rate": 0.00029955319230429584 + }, + { + "step": 73, + "epoch": 0.37150127226463103, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821036544, + "loss": 1.4233, + "grad_norm": 1.2433674335479736, + "learning_rate": 0.00029948184994980486 + }, + { + "step": 74, + "epoch": 0.37659033078880405, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821090304, + "loss": 1.4586, + "grad_norm": 1.7939504384994507, + "learning_rate": 0.0002994052355930409 + }, + { + "step": 75, + "epoch": 0.3816793893129771, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821133312, + "loss": 1.4527, + "grad_norm": 2.049440383911133, + "learning_rate": 0.0002993233519360781 + }, + { + "step": 76, + "epoch": 0.38676844783715014, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821081088, + "loss": 1.4022, + "grad_norm": 1.1904855966567993, + "learning_rate": 0.0002992362018668312 + }, + { + "step": 77, + "epoch": 0.39185750636132316, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821019648, + "loss": 1.388, + "grad_norm": 0.6846556663513184, + "learning_rate": 0.00029914378845895343 + }, + { + "step": 78, + "epoch": 0.3969465648854962, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821121024, + "loss": 1.3602, + "grad_norm": 0.9406521916389465, + "learning_rate": 0.000299046114971728 + }, + { + "step": 79, + "epoch": 0.4020356234096692, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82113792, + "loss": 1.3127, + "grad_norm": 1.5447375774383545, + "learning_rate": 0.0002989431848499534 + }, + { + "step": 80, + "epoch": 0.4071246819338422, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820978176, + "loss": 1.5185, + "grad_norm": 2.7212510108947754, + "learning_rate": 0.0002988350017238218 + }, + { + "step": 81, + "epoch": 0.4122137404580153, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82108416, + "loss": 1.4423, + "grad_norm": 2.1437230110168457, + "learning_rate": 0.0002987215694087909 + }, + { + "step": 82, + "epoch": 0.4173027989821883, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821056512, + "loss": 1.4406, + "grad_norm": 2.396592855453491, + "learning_rate": 0.0002986028919054496 + }, + { + "step": 83, + "epoch": 0.4223918575063613, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821070336, + "loss": 1.5115, + "grad_norm": 3.6540367603302, + "learning_rate": 0.00029847897339937675 + }, + { + "step": 84, + "epoch": 0.42748091603053434, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82107648, + "loss": 1.4399, + "grad_norm": 2.2770965099334717, + "learning_rate": 0.0002983498182609935 + }, + { + "step": 85, + "epoch": 0.43256997455470736, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821105664, + "loss": 1.3924, + "grad_norm": 0.4663897454738617, + "learning_rate": 0.0002982154310454093 + }, + { + "step": 86, + "epoch": 0.43765903307888043, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821047296, + "loss": 1.3515, + "grad_norm": 0.4849090278148651, + "learning_rate": 0.00029807581649226114 + }, + { + "step": 87, + "epoch": 0.44274809160305345, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8210688, + "loss": 1.4423, + "grad_norm": 1.36202871799469, + "learning_rate": 0.00029793097952554646 + }, + { + "step": 88, + "epoch": 0.44783715012722647, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821094912, + "loss": 1.3724, + "grad_norm": 1.0862525701522827, + "learning_rate": 0.0002977809252534494 + }, + { + "step": 89, + "epoch": 0.4529262086513995, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821042688, + "loss": 1.429, + "grad_norm": 1.4298802614212036, + "learning_rate": 0.00029762565896816073 + }, + { + "step": 90, + "epoch": 0.4580152671755725, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821074944, + "loss": 1.4007, + "grad_norm": 1.319437861442566, + "learning_rate": 0.000297465186145691 + }, + { + "step": 91, + "epoch": 0.4631043256997455, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821067264, + "loss": 1.373, + "grad_norm": 1.2720054388046265, + "learning_rate": 0.0002972995124456779 + }, + { + "step": 92, + "epoch": 0.4681933842239186, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82100736, + "loss": 1.4655, + "grad_norm": 2.014648675918579, + "learning_rate": 0.0002971286437111861 + }, + { + "step": 93, + "epoch": 0.4732824427480916, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821208576, + "loss": 1.4411, + "grad_norm": 1.483183741569519, + "learning_rate": 0.0002969525859685014 + }, + { + "step": 94, + "epoch": 0.47837150127226463, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82102272, + "loss": 1.4625, + "grad_norm": 1.8429388999938965, + "learning_rate": 0.0002967713454269183 + }, + { + "step": 95, + "epoch": 0.48346055979643765, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821190144, + "loss": 1.4079, + "grad_norm": 1.169232964515686, + "learning_rate": 0.0002965849284785207 + }, + { + "step": 96, + "epoch": 0.48854961832061067, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821064192, + "loss": 1.4258, + "grad_norm": 1.7047373056411743, + "learning_rate": 0.000296393341697957 + }, + { + "step": 97, + "epoch": 0.49363867684478374, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821059584, + "loss": 1.4598, + "grad_norm": 1.9307148456573486, + "learning_rate": 0.00029619659184220755 + }, + { + "step": 98, + "epoch": 0.49872773536895676, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8211072, + "loss": 1.3959, + "grad_norm": 1.4363160133361816, + "learning_rate": 0.00029599468585034684 + }, + { + "step": 99, + "epoch": 0.5038167938931297, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821088768, + "loss": 1.4811, + "grad_norm": 2.9748597145080566, + "learning_rate": 0.0002957876308432986 + }, + { + "step": 100, + "epoch": 0.5089058524173028, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82105344, + "loss": 1.3672, + "grad_norm": 0.6291810870170593, + "learning_rate": 0.0002955754341235846 + }, + { + "step": 101, + "epoch": 0.5139949109414759, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821036544, + "loss": 1.3941, + "grad_norm": 0.572160005569458, + "learning_rate": 0.00029535810317506714 + }, + { + "step": 102, + "epoch": 0.5190839694656488, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821087232, + "loss": 1.4059, + "grad_norm": 1.0223088264465332, + "learning_rate": 0.00029513564566268524 + }, + { + "step": 103, + "epoch": 0.5241730279898219, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821047296, + "loss": 1.4209, + "grad_norm": 1.5404703617095947, + "learning_rate": 0.0002949080694321841 + }, + { + "step": 104, + "epoch": 0.5292620865139949, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821136384, + "loss": 1.4371, + "grad_norm": 1.5860410928726196, + "learning_rate": 0.0002946753825098386 + }, + { + "step": 105, + "epoch": 0.5343511450381679, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821005824, + "loss": 1.4498, + "grad_norm": 2.137938976287842, + "learning_rate": 0.0002944375931021699 + }, + { + "step": 106, + "epoch": 0.539440203562341, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8210688, + "loss": 1.4196, + "grad_norm": 1.2961212396621704, + "learning_rate": 0.0002941947095956564 + }, + { + "step": 107, + "epoch": 0.544529262086514, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821064192, + "loss": 1.3932, + "grad_norm": 0.7637197375297546, + "learning_rate": 0.0002939467405564377 + }, + { + "step": 108, + "epoch": 0.549618320610687, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82105344, + "loss": 1.3669, + "grad_norm": 0.6210586428642273, + "learning_rate": 0.00029369369473001265 + }, + { + "step": 109, + "epoch": 0.55470737913486, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8212224, + "loss": 1.3997, + "grad_norm": 0.8344686031341553, + "learning_rate": 0.0002934355810409307 + }, + { + "step": 110, + "epoch": 0.5597964376590331, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821019648, + "loss": 1.3571, + "grad_norm": 0.8124700784683228, + "learning_rate": 0.0002931724085924774 + }, + { + "step": 111, + "epoch": 0.5648854961832062, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821021184, + "loss": 1.3719, + "grad_norm": 0.5583367347717285, + "learning_rate": 0.00029290418666635314 + }, + { + "step": 112, + "epoch": 0.5699745547073791, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821121024, + "loss": 1.4333, + "grad_norm": 1.3898857831954956, + "learning_rate": 0.0002926309247223459 + }, + { + "step": 113, + "epoch": 0.5750636132315522, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821219328, + "loss": 1.4643, + "grad_norm": 1.4439101219177246, + "learning_rate": 0.0002923526323979975 + }, + { + "step": 114, + "epoch": 0.5801526717557252, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821048832, + "loss": 1.3991, + "grad_norm": 0.6095612049102783, + "learning_rate": 0.00029206931950826387 + }, + { + "step": 115, + "epoch": 0.5852417302798982, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821058048, + "loss": 1.3906, + "grad_norm": 0.44761934876441956, + "learning_rate": 0.00029178099604516876 + }, + { + "step": 116, + "epoch": 0.5903307888040712, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821119488, + "loss": 1.4041, + "grad_norm": 0.8002026677131653, + "learning_rate": 0.0002914876721774515 + }, + { + "step": 117, + "epoch": 0.5954198473282443, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821011968, + "loss": 1.3863, + "grad_norm": 0.35989442467689514, + "learning_rate": 0.00029118935825020806 + }, + { + "step": 118, + "epoch": 0.6005089058524173, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8211072, + "loss": 1.3788, + "grad_norm": 0.3892383277416229, + "learning_rate": 0.00029088606478452656 + }, + { + "step": 119, + "epoch": 0.6055979643765903, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821240832, + "loss": 1.3924, + "grad_norm": 0.8341579437255859, + "learning_rate": 0.0002905778024771158 + }, + { + "step": 120, + "epoch": 0.6106870229007634, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821144064, + "loss": 1.3973, + "grad_norm": 1.0399988889694214, + "learning_rate": 0.00029026458219992855 + }, + { + "step": 121, + "epoch": 0.6157760814249363, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821190144, + "loss": 1.3802, + "grad_norm": 0.4304821789264679, + "learning_rate": 0.00028994641499977745 + }, + { + "step": 122, + "epoch": 0.6208651399491094, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821134848, + "loss": 1.4005, + "grad_norm": 0.6028931140899658, + "learning_rate": 0.00028962331209794604 + }, + { + "step": 123, + "epoch": 0.6259541984732825, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821173248, + "loss": 1.4059, + "grad_norm": 1.2111930847167969, + "learning_rate": 0.00028929528488979244 + }, + { + "step": 124, + "epoch": 0.6310432569974554, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821096448, + "loss": 1.3603, + "grad_norm": 0.746380090713501, + "learning_rate": 0.0002889623449443479 + }, + { + "step": 125, + "epoch": 0.6361323155216285, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821131776, + "loss": 1.3978, + "grad_norm": 0.8815646767616272, + "learning_rate": 0.0002886245040039086 + }, + { + "step": 126, + "epoch": 0.6412213740458015, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82103808, + "loss": 1.3641, + "grad_norm": 0.6282069683074951, + "learning_rate": 0.0002882817739836215 + }, + { + "step": 127, + "epoch": 0.6463104325699746, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821062656, + "loss": 1.4178, + "grad_norm": 0.9397181868553162, + "learning_rate": 0.000287934166971064 + }, + { + "step": 128, + "epoch": 0.6513994910941476, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821035008, + "loss": 1.3697, + "grad_norm": 0.46435537934303284, + "learning_rate": 0.0002875816952258179 + }, + { + "step": 129, + "epoch": 0.6564885496183206, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821041152, + "loss": 1.3973, + "grad_norm": 0.9885510206222534, + "learning_rate": 0.00028722437117903693 + }, + { + "step": 130, + "epoch": 0.6615776081424937, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821031936, + "loss": 1.4482, + "grad_norm": 1.469680905342102, + "learning_rate": 0.000286862207433008 + }, + { + "step": 131, + "epoch": 0.6666666666666666, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820965888, + "loss": 1.4225, + "grad_norm": 1.0637551546096802, + "learning_rate": 0.00028649521676070726 + }, + { + "step": 132, + "epoch": 0.6717557251908397, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821128704, + "loss": 1.401, + "grad_norm": 0.8559896349906921, + "learning_rate": 0.0002861234121053493 + }, + { + "step": 133, + "epoch": 0.6768447837150128, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821041152, + "loss": 1.4099, + "grad_norm": 1.2120181322097778, + "learning_rate": 0.0002857468065799307 + }, + { + "step": 134, + "epoch": 0.6819338422391857, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821044224, + "loss": 1.3982, + "grad_norm": 0.7364377379417419, + "learning_rate": 0.0002853654134667676 + }, + { + "step": 135, + "epoch": 0.6870229007633588, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82109952, + "loss": 1.4227, + "grad_norm": 1.128157377243042, + "learning_rate": 0.0002849792462170271 + }, + { + "step": 136, + "epoch": 0.6921119592875318, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821002752, + "loss": 1.4673, + "grad_norm": 1.1331865787506104, + "learning_rate": 0.0002845883184502533 + }, + { + "step": 137, + "epoch": 0.6972010178117048, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821170176, + "loss": 1.3908, + "grad_norm": 0.6120572090148926, + "learning_rate": 0.00028419264395388626 + }, + { + "step": 138, + "epoch": 0.7022900763358778, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821036544, + "loss": 1.3453, + "grad_norm": 0.6200059652328491, + "learning_rate": 0.0002837922366827765 + }, + { + "step": 139, + "epoch": 0.7073791348600509, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821025792, + "loss": 1.4072, + "grad_norm": 1.3088102340698242, + "learning_rate": 0.00028338711075869216 + }, + { + "step": 140, + "epoch": 0.712468193384224, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821078016, + "loss": 1.3812, + "grad_norm": 1.0158618688583374, + "learning_rate": 0.00028297728046982137 + }, + { + "step": 141, + "epoch": 0.7175572519083969, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821002752, + "loss": 1.4295, + "grad_norm": 1.341398000717163, + "learning_rate": 0.00028256276027026816 + }, + { + "step": 142, + "epoch": 0.72264631043257, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821064192, + "loss": 1.4563, + "grad_norm": 1.433461308479309, + "learning_rate": 0.0002821435647795429 + }, + { + "step": 143, + "epoch": 0.727735368956743, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821062656, + "loss": 1.3784, + "grad_norm": 0.5539003014564514, + "learning_rate": 0.00028171970878204623 + }, + { + "step": 144, + "epoch": 0.732824427480916, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82100736, + "loss": 1.3814, + "grad_norm": 0.40661826729774475, + "learning_rate": 0.0002812912072265481 + }, + { + "step": 145, + "epoch": 0.7379134860050891, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821004288, + "loss": 1.4083, + "grad_norm": 1.0502127408981323, + "learning_rate": 0.00028085807522566043 + }, + { + "step": 146, + "epoch": 0.7430025445292621, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821105664, + "loss": 1.3976, + "grad_norm": 0.8885952234268188, + "learning_rate": 0.00028042032805530387 + }, + { + "step": 147, + "epoch": 0.7480916030534351, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821110272, + "loss": 1.3728, + "grad_norm": 0.6923975348472595, + "learning_rate": 0.00027997798115416935 + }, + { + "step": 148, + "epoch": 0.7531806615776081, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821208576, + "loss": 1.4091, + "grad_norm": 0.8596552014350891, + "learning_rate": 0.0002795310501231734 + }, + { + "step": 149, + "epoch": 0.7582697201017812, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821051904, + "loss": 1.4018, + "grad_norm": 0.7871503233909607, + "learning_rate": 0.0002790795507249081 + }, + { + "step": 150, + "epoch": 0.7633587786259542, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821044224, + "loss": 1.3892, + "grad_norm": 0.5919831991195679, + "learning_rate": 0.00027862349888308494 + }, + { + "step": 151, + "epoch": 0.7684478371501272, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820987392, + "loss": 1.3764, + "grad_norm": 0.32677996158599854, + "learning_rate": 0.0002781629106819733 + }, + { + "step": 152, + "epoch": 0.7735368956743003, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821019648, + "loss": 1.3836, + "grad_norm": 0.45640766620635986, + "learning_rate": 0.00027769780236583315 + }, + { + "step": 153, + "epoch": 0.7786259541984732, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821047296, + "loss": 1.3741, + "grad_norm": 0.6364334225654602, + "learning_rate": 0.0002772281903383424 + }, + { + "step": 154, + "epoch": 0.7837150127226463, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821097984, + "loss": 1.3921, + "grad_norm": 0.8242989778518677, + "learning_rate": 0.00027675409116201797 + }, + { + "step": 155, + "epoch": 0.7888040712468194, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821010432, + "loss": 1.4035, + "grad_norm": 0.3738221824169159, + "learning_rate": 0.00027627552155763186 + }, + { + "step": 156, + "epoch": 0.7938931297709924, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8210304, + "loss": 1.3897, + "grad_norm": 0.8507632613182068, + "learning_rate": 0.00027579249840362145 + }, + { + "step": 157, + "epoch": 0.7989821882951654, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8211072, + "loss": 1.3927, + "grad_norm": 0.8078579902648926, + "learning_rate": 0.0002753050387354942 + }, + { + "step": 158, + "epoch": 0.8040712468193384, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821008896, + "loss": 1.3944, + "grad_norm": 0.5337038040161133, + "learning_rate": 0.0002748131597452268 + }, + { + "step": 159, + "epoch": 0.8091603053435115, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821110272, + "loss": 1.3975, + "grad_norm": 0.6491421461105347, + "learning_rate": 0.00027431687878065874 + }, + { + "step": 160, + "epoch": 0.8142493638676844, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821067264, + "loss": 1.3991, + "grad_norm": 1.1651588678359985, + "learning_rate": 0.00027381621334488085 + }, + { + "step": 161, + "epoch": 0.8193384223918575, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821065728, + "loss": 1.3625, + "grad_norm": 0.4121988117694855, + "learning_rate": 0.00027331118109561744 + }, + { + "step": 162, + "epoch": 0.8244274809160306, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821048832, + "loss": 1.3816, + "grad_norm": 0.43150025606155396, + "learning_rate": 0.000272801799844604 + }, + { + "step": 163, + "epoch": 0.8295165394402035, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821165568, + "loss": 1.4383, + "grad_norm": 1.0060560703277588, + "learning_rate": 0.00027228808755695884 + }, + { + "step": 164, + "epoch": 0.8346055979643766, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82106112, + "loss": 1.396, + "grad_norm": 0.87540602684021, + "learning_rate": 0.00027177006235054943 + }, + { + "step": 165, + "epoch": 0.8396946564885496, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821140992, + "loss": 1.3968, + "grad_norm": 0.46050024032592773, + "learning_rate": 0.0002712477424953534 + }, + { + "step": 166, + "epoch": 0.8447837150127226, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821050368, + "loss": 1.3754, + "grad_norm": 0.7831093072891235, + "learning_rate": 0.00027072114641281435 + }, + { + "step": 167, + "epoch": 0.8498727735368957, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821011968, + "loss": 1.369, + "grad_norm": 0.6410794258117676, + "learning_rate": 0.0002701902926751921 + }, + { + "step": 168, + "epoch": 0.8549618320610687, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82101504, + "loss": 1.3508, + "grad_norm": 0.5212058424949646, + "learning_rate": 0.00026965520000490743 + }, + { + "step": 169, + "epoch": 0.8600508905852418, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821082624, + "loss": 1.364, + "grad_norm": 0.7267853021621704, + "learning_rate": 0.0002691158872738822 + }, + { + "step": 170, + "epoch": 0.8651399491094147, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821090304, + "loss": 1.4076, + "grad_norm": 0.7872965335845947, + "learning_rate": 0.00026857237350287334 + }, + { + "step": 171, + "epoch": 0.8702290076335878, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821064192, + "loss": 1.4045, + "grad_norm": 0.6829628944396973, + "learning_rate": 0.0002680246778608023 + }, + { + "step": 172, + "epoch": 0.8753180661577609, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821044224, + "loss": 1.3754, + "grad_norm": 0.28063952922821045, + "learning_rate": 0.0002674728196640788 + }, + { + "step": 173, + "epoch": 0.8804071246819338, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821081088, + "loss": 1.4039, + "grad_norm": 0.7310879230499268, + "learning_rate": 0.00026691681837591984 + }, + { + "step": 174, + "epoch": 0.8854961832061069, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821024256, + "loss": 1.38, + "grad_norm": 0.2757090926170349, + "learning_rate": 0.00026635669360566296 + }, + { + "step": 175, + "epoch": 0.8905852417302799, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821094912, + "loss": 1.39, + "grad_norm": 1.1245434284210205, + "learning_rate": 0.00026579246510807477 + }, + { + "step": 176, + "epoch": 0.8956743002544529, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820992, + "loss": 1.3698, + "grad_norm": 0.4498777687549591, + "learning_rate": 0.00026522415278265425 + }, + { + "step": 177, + "epoch": 0.9007633587786259, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821001216, + "loss": 1.4273, + "grad_norm": 1.0490611791610718, + "learning_rate": 0.0002646517766729309 + }, + { + "step": 178, + "epoch": 0.905852417302799, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821078016, + "loss": 1.4165, + "grad_norm": 0.9493329524993896, + "learning_rate": 0.0002640753569657579 + }, + { + "step": 179, + "epoch": 0.910941475826972, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821085696, + "loss": 1.3709, + "grad_norm": 0.3585142493247986, + "learning_rate": 0.0002634949139906 + }, + { + "step": 180, + "epoch": 0.916030534351145, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82109184, + "loss": 1.4153, + "grad_norm": 0.903349757194519, + "learning_rate": 0.00026291046821881673 + }, + { + "step": 181, + "epoch": 0.9211195928753181, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821010432, + "loss": 1.4102, + "grad_norm": 0.6674940586090088, + "learning_rate": 0.0002623220402629402 + }, + { + "step": 182, + "epoch": 0.926208651399491, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821121024, + "loss": 1.3958, + "grad_norm": 0.5497495532035828, + "learning_rate": 0.0002617296508759483 + }, + { + "step": 183, + "epoch": 0.9312977099236641, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821108736, + "loss": 1.4003, + "grad_norm": 0.7935939431190491, + "learning_rate": 0.00026113332095053257 + }, + { + "step": 184, + "epoch": 0.9363867684478372, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82109184, + "loss": 1.4213, + "grad_norm": 1.1277867555618286, + "learning_rate": 0.0002605330715183616 + }, + { + "step": 185, + "epoch": 0.9414758269720102, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8210304, + "loss": 1.4191, + "grad_norm": 1.1471519470214844, + "learning_rate": 0.0002599289237493392 + }, + { + "step": 186, + "epoch": 0.9465648854961832, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821062656, + "loss": 1.3697, + "grad_norm": 0.40237319469451904, + "learning_rate": 0.0002593208989508575 + }, + { + "step": 187, + "epoch": 0.9516539440203562, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821159424, + "loss": 1.3653, + "grad_norm": 0.27740904688835144, + "learning_rate": 0.00025870901856704583 + }, + { + "step": 188, + "epoch": 0.9567430025445293, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821131776, + "loss": 1.3604, + "grad_norm": 0.29505857825279236, + "learning_rate": 0.00025809330417801425 + }, + { + "step": 189, + "epoch": 0.9618320610687023, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821036544, + "loss": 1.5091, + "grad_norm": 1.6612234115600586, + "learning_rate": 0.00025747377749909254 + }, + { + "step": 190, + "epoch": 0.9669211195928753, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8210688, + "loss": 1.4032, + "grad_norm": 0.9677612781524658, + "learning_rate": 0.00025685046038006413 + }, + { + "step": 191, + "epoch": 0.9720101781170484, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821058048, + "loss": 1.4246, + "grad_norm": 0.7989298701286316, + "learning_rate": 0.0002562233748043958 + }, + { + "step": 192, + "epoch": 0.9770992366412213, + "cpu_mem": 3.113271296, + "gpu_mem": 1.820995072, + "loss": 1.422, + "grad_norm": 1.4173755645751953, + "learning_rate": 0.00025559254288846196 + }, + { + "step": 193, + "epoch": 0.9821882951653944, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821116416, + "loss": 1.4061, + "grad_norm": 0.5086129903793335, + "learning_rate": 0.0002549579868807651 + }, + { + "step": 194, + "epoch": 0.9872773536895675, + "cpu_mem": 3.113271296, + "gpu_mem": 1.82103808, + "loss": 1.3856, + "grad_norm": 1.0409482717514038, + "learning_rate": 0.0002543197291611507 + }, + { + "step": 195, + "epoch": 0.9923664122137404, + "cpu_mem": 3.113271296, + "gpu_mem": 1.821150208, + "loss": 1.3663, + "grad_norm": 0.4114460349082947, + "learning_rate": 0.0002536777922400183 + }, + { + "step": 196, + "epoch": 0.9974554707379135, + "cpu_mem": 3.113271296, + "gpu_mem": 1.8211456, + "loss": 1.3955, + "grad_norm": 1.0342953205108643, + "learning_rate": 0.0002530321987575271 + }, + { + "step": 197, + "epoch": 1.0025445292620865, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905154048, + "loss": 2.0512, + "grad_norm": 0.9975292682647705, + "learning_rate": 0.0002523829714827981 + }, + { + "step": 198, + "epoch": 1.0076335877862594, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905220096, + "loss": 1.3788, + "grad_norm": 0.5035502314567566, + "learning_rate": 0.00025173013331311053 + }, + { + "step": 199, + "epoch": 1.0127226463104326, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905217024, + "loss": 1.3792, + "grad_norm": 1.0316883325576782, + "learning_rate": 0.0002510737072730946 + }, + { + "step": 200, + "epoch": 1.0178117048346056, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905101824, + "loss": 1.3696, + "grad_norm": 0.9440978169441223, + "learning_rate": 0.0002504137165139193 + }, + { + "step": 201, + "epoch": 1.0229007633587786, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905152512, + "loss": 1.3945, + "grad_norm": 1.1476325988769531, + "learning_rate": 0.0002497501843124761 + }, + { + "step": 202, + "epoch": 1.0279898218829517, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905146368, + "loss": 1.4563, + "grad_norm": 1.7681964635849, + "learning_rate": 0.00024908313407055765 + }, + { + "step": 203, + "epoch": 1.0330788804071247, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905193984, + "loss": 1.3872, + "grad_norm": 0.8654902577400208, + "learning_rate": 0.00024841258931403284 + }, + { + "step": 204, + "epoch": 1.0381679389312977, + "cpu_mem": 3.113271296, + "gpu_mem": 1.9051264, + "loss": 1.3303, + "grad_norm": 0.7527448534965515, + "learning_rate": 0.00024773857369201675 + }, + { + "step": 205, + "epoch": 1.0432569974554706, + "cpu_mem": 3.113271296, + "gpu_mem": 1.90517248, + "loss": 1.3708, + "grad_norm": 0.6352161765098572, + "learning_rate": 0.00024706111097603676 + }, + { + "step": 206, + "epoch": 1.0483460559796438, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905186304, + "loss": 1.3298, + "grad_norm": 0.8687700033187866, + "learning_rate": 0.00024638022505919425 + }, + { + "step": 207, + "epoch": 1.0534351145038168, + "cpu_mem": 3.113271296, + "gpu_mem": 1.90514176, + "loss": 1.3585, + "grad_norm": 1.6945754289627075, + "learning_rate": 0.00024569593995532157 + }, + { + "step": 208, + "epoch": 1.0585241730279897, + "cpu_mem": 3.113271296, + "gpu_mem": 1.90513408, + "loss": 1.4337, + "grad_norm": 1.9076613187789917, + "learning_rate": 0.00024500827979813546 + }, + { + "step": 209, + "epoch": 1.063613231552163, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905255424, + "loss": 1.4284, + "grad_norm": 1.7829077243804932, + "learning_rate": 0.0002443172688403859 + }, + { + "step": 210, + "epoch": 1.0687022900763359, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905152512, + "loss": 1.4085, + "grad_norm": 2.1941967010498047, + "learning_rate": 0.00024362293145300027 + }, + { + "step": 211, + "epoch": 1.0737913486005088, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905154048, + "loss": 1.361, + "grad_norm": 1.5711240768432617, + "learning_rate": 0.00024292529212422445 + }, + { + "step": 212, + "epoch": 1.078880407124682, + "cpu_mem": 3.113271296, + "gpu_mem": 1.90515712, + "loss": 1.3718, + "grad_norm": 0.6616776585578918, + "learning_rate": 0.00024222437545875887 + }, + { + "step": 213, + "epoch": 1.083969465648855, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905083392, + "loss": 1.4145, + "grad_norm": 1.1617729663848877, + "learning_rate": 0.0002415202061768906 + }, + { + "step": 214, + "epoch": 1.089058524173028, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905078784, + "loss": 1.3481, + "grad_norm": 0.9168514609336853, + "learning_rate": 0.0002408128091136217 + }, + { + "step": 215, + "epoch": 1.094147582697201, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905129472, + "loss": 1.37, + "grad_norm": 1.6636909246444702, + "learning_rate": 0.00024010220921779336 + }, + { + "step": 216, + "epoch": 1.099236641221374, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905258496, + "loss": 1.3568, + "grad_norm": 0.6562288403511047, + "learning_rate": 0.00023938843155120581 + }, + { + "step": 217, + "epoch": 1.104325699745547, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905120256, + "loss": 1.3331, + "grad_norm": 0.8974683284759521, + "learning_rate": 0.00023867150128773453 + }, + { + "step": 218, + "epoch": 1.10941475826972, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905146368, + "loss": 1.3182, + "grad_norm": 0.574216902256012, + "learning_rate": 0.0002379514437124425 + }, + { + "step": 219, + "epoch": 1.1145038167938932, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905158656, + "loss": 1.3705, + "grad_norm": 0.811642587184906, + "learning_rate": 0.00023722828422068814 + }, + { + "step": 220, + "epoch": 1.1195928753180662, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905131008, + "loss": 1.4478, + "grad_norm": 1.658861756324768, + "learning_rate": 0.00023650204831723008 + }, + { + "step": 221, + "epoch": 1.1246819338422391, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905104896, + "loss": 1.4011, + "grad_norm": 0.6337707042694092, + "learning_rate": 0.00023577276161532718 + }, + { + "step": 222, + "epoch": 1.1297709923664123, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905117184, + "loss": 1.359, + "grad_norm": 0.7838967442512512, + "learning_rate": 0.0002350404498358356 + }, + { + "step": 223, + "epoch": 1.1348600508905853, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905144832, + "loss": 1.3019, + "grad_norm": 0.6580224633216858, + "learning_rate": 0.00023430513880630133 + }, + { + "step": 224, + "epoch": 1.1399491094147582, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905163264, + "loss": 1.3611, + "grad_norm": 0.5345458984375, + "learning_rate": 0.00023356685446004966 + }, + { + "step": 225, + "epoch": 1.1450381679389312, + "cpu_mem": 3.113271296, + "gpu_mem": 1.905213952, + "loss": 1.3918, + "grad_norm": 0.898566484451294, + "learning_rate": 0.00023282562283527005 + }, + { + "step": 226, + "epoch": 1.1501272264631044, + "cpu_mem": 3.113271296, + "gpu_mem": 1.9051648, + "loss": 1.4015, + "grad_norm": 0.7438874840736389, + "learning_rate": 0.00023208147007409827 + }, + { + "step": 227, + "epoch": 1.1552162849872774, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905115648, + "loss": 1.3923, + "grad_norm": 0.7566032409667969, + "learning_rate": 0.00023133442242169425 + }, + { + "step": 228, + "epoch": 1.1603053435114503, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905160192, + "loss": 1.402, + "grad_norm": 0.525081217288971, + "learning_rate": 0.00023058450622531632 + }, + { + "step": 229, + "epoch": 1.1653944020356235, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905058816, + "loss": 1.3747, + "grad_norm": 0.5139512419700623, + "learning_rate": 0.00022983174793339206 + }, + { + "step": 230, + "epoch": 1.1704834605597965, + "cpu_mem": 3.113467904, + "gpu_mem": 1.9051264, + "loss": 1.3996, + "grad_norm": 0.40653568506240845, + "learning_rate": 0.0002290761740945857 + }, + { + "step": 231, + "epoch": 1.1755725190839694, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905117184, + "loss": 1.3646, + "grad_norm": 0.6309854388237, + "learning_rate": 0.00022831781135686135 + }, + { + "step": 232, + "epoch": 1.1806615776081424, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905121792, + "loss": 1.3564, + "grad_norm": 0.840477705001831, + "learning_rate": 0.00022755668646654375 + }, + { + "step": 233, + "epoch": 1.1857506361323156, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905236992, + "loss": 1.4048, + "grad_norm": 0.9431836009025574, + "learning_rate": 0.00022679282626737442 + }, + { + "step": 234, + "epoch": 1.1908396946564885, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905197056, + "loss": 1.4298, + "grad_norm": 1.051881194114685, + "learning_rate": 0.00022602625769956519 + }, + { + "step": 235, + "epoch": 1.1959287531806615, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905155584, + "loss": 1.395, + "grad_norm": 0.8881512880325317, + "learning_rate": 0.00022525700779884802 + }, + { + "step": 236, + "epoch": 1.2010178117048347, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905154048, + "loss": 1.4105, + "grad_norm": 1.0743992328643799, + "learning_rate": 0.00022448510369552164 + }, + { + "step": 237, + "epoch": 1.2061068702290076, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905178624, + "loss": 1.383, + "grad_norm": 0.5493573546409607, + "learning_rate": 0.0002237105726134943 + }, + { + "step": 238, + "epoch": 1.2111959287531806, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905198592, + "loss": 1.3848, + "grad_norm": 0.5919926762580872, + "learning_rate": 0.00022293344186932406 + }, + { + "step": 239, + "epoch": 1.2162849872773536, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905115648, + "loss": 1.431, + "grad_norm": 1.232595443725586, + "learning_rate": 0.00022215373887125514 + }, + { + "step": 240, + "epoch": 1.2213740458015268, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905250816, + "loss": 1.3981, + "grad_norm": 0.7269375324249268, + "learning_rate": 0.00022137149111825128 + }, + { + "step": 241, + "epoch": 1.2264631043256997, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905204736, + "loss": 1.3797, + "grad_norm": 0.6194531321525574, + "learning_rate": 0.00022058672619902606 + }, + { + "step": 242, + "epoch": 1.2315521628498727, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90522624, + "loss": 1.3875, + "grad_norm": 0.44889453053474426, + "learning_rate": 0.00021979947179106966 + }, + { + "step": 243, + "epoch": 1.2366412213740459, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905112576, + "loss": 1.38, + "grad_norm": 1.638612151145935, + "learning_rate": 0.0002190097556596728 + }, + { + "step": 244, + "epoch": 1.2417302798982188, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905100288, + "loss": 1.4224, + "grad_norm": 1.138084888458252, + "learning_rate": 0.0002182176056569476 + }, + { + "step": 245, + "epoch": 1.2468193384223918, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905104896, + "loss": 1.4175, + "grad_norm": 1.0911611318588257, + "learning_rate": 0.00021742304972084518 + }, + { + "step": 246, + "epoch": 1.2519083969465647, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514176, + "loss": 1.391, + "grad_norm": 0.671161949634552, + "learning_rate": 0.00021662611587417035 + }, + { + "step": 247, + "epoch": 1.256997455470738, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90510336, + "loss": 1.3581, + "grad_norm": 0.8125865459442139, + "learning_rate": 0.00021582683222359317 + }, + { + "step": 248, + "epoch": 1.262086513994911, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514944, + "loss": 1.378, + "grad_norm": 0.5536556839942932, + "learning_rate": 0.00021502522695865796 + }, + { + "step": 249, + "epoch": 1.267175572519084, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905158656, + "loss": 1.3855, + "grad_norm": 0.5323188900947571, + "learning_rate": 0.00021422132835078884 + }, + { + "step": 250, + "epoch": 1.272264631043257, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905197056, + "loss": 1.387, + "grad_norm": 0.8608607649803162, + "learning_rate": 0.0002134151647522927 + }, + { + "step": 251, + "epoch": 1.27735368956743, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905115648, + "loss": 1.3721, + "grad_norm": 0.6481439471244812, + "learning_rate": 0.00021260676459535933 + }, + { + "step": 252, + "epoch": 1.282442748091603, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905124864, + "loss": 1.4, + "grad_norm": 1.2097550630569458, + "learning_rate": 0.00021179615639105857 + }, + { + "step": 253, + "epoch": 1.2875318066157762, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905190912, + "loss": 1.3719, + "grad_norm": 0.574444591999054, + "learning_rate": 0.00021098336872833482 + }, + { + "step": 254, + "epoch": 1.2926208651399491, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905117184, + "loss": 1.3664, + "grad_norm": 0.7013645172119141, + "learning_rate": 0.0002101684302729987 + }, + { + "step": 255, + "epoch": 1.297709923664122, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90511104, + "loss": 1.3773, + "grad_norm": 0.7573210000991821, + "learning_rate": 0.00020935136976671617 + }, + { + "step": 256, + "epoch": 1.3027989821882953, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905177088, + "loss": 1.399, + "grad_norm": 0.9376775622367859, + "learning_rate": 0.00020853221602599458 + }, + { + "step": 257, + "epoch": 1.3078880407124682, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905135616, + "loss": 1.3547, + "grad_norm": 0.5309875011444092, + "learning_rate": 0.00020771099794116672 + }, + { + "step": 258, + "epoch": 1.3129770992366412, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90524928, + "loss": 1.3476, + "grad_norm": 0.7361463904380798, + "learning_rate": 0.0002068877444753717 + }, + { + "step": 259, + "epoch": 1.3180661577608141, + "cpu_mem": 3.113467904, + "gpu_mem": 1.9051264, + "loss": 1.3614, + "grad_norm": 0.6686943769454956, + "learning_rate": 0.0002060624846635335 + }, + { + "step": 260, + "epoch": 1.3231552162849873, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905152512, + "loss": 1.3732, + "grad_norm": 0.647790253162384, + "learning_rate": 0.00020523524761133677 + }, + { + "step": 261, + "epoch": 1.3282442748091603, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905158656, + "loss": 1.3457, + "grad_norm": 0.799545407295227, + "learning_rate": 0.00020440606249420073 + }, + { + "step": 262, + "epoch": 1.3333333333333333, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90522624, + "loss": 1.3941, + "grad_norm": 0.7074455618858337, + "learning_rate": 0.00020357495855624974 + }, + { + "step": 263, + "epoch": 1.3384223918575064, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905081856, + "loss": 1.3664, + "grad_norm": 0.973965048789978, + "learning_rate": 0.0002027419651092822 + }, + { + "step": 264, + "epoch": 1.3435114503816794, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905215488, + "loss": 1.3765, + "grad_norm": 0.7563479542732239, + "learning_rate": 0.00020190711153173676 + }, + { + "step": 265, + "epoch": 1.3486005089058524, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905244672, + "loss": 1.3375, + "grad_norm": 0.9132266044616699, + "learning_rate": 0.00020107042726765588 + }, + { + "step": 266, + "epoch": 1.3536895674300253, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905114112, + "loss": 1.3684, + "grad_norm": 1.130285382270813, + "learning_rate": 0.0002002319418256479 + }, + { + "step": 267, + "epoch": 1.3587786259541985, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905115648, + "loss": 1.4424, + "grad_norm": 1.6794543266296387, + "learning_rate": 0.00019939168477784583 + }, + { + "step": 268, + "epoch": 1.3638676844783715, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905117184, + "loss": 1.3524, + "grad_norm": 1.016810655593872, + "learning_rate": 0.00019854968575886458 + }, + { + "step": 269, + "epoch": 1.3689567430025447, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905190912, + "loss": 1.4163, + "grad_norm": 1.2469711303710938, + "learning_rate": 0.00019770597446475588 + }, + { + "step": 270, + "epoch": 1.3740458015267176, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90517248, + "loss": 1.3087, + "grad_norm": 0.6948901414871216, + "learning_rate": 0.0001968605806519608 + }, + { + "step": 271, + "epoch": 1.3791348600508906, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905115648, + "loss": 1.3694, + "grad_norm": 0.6360764503479004, + "learning_rate": 0.00019601353413626032 + }, + { + "step": 272, + "epoch": 1.3842239185750635, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905154048, + "loss": 1.3759, + "grad_norm": 1.1712974309921265, + "learning_rate": 0.00019516486479172386 + }, + { + "step": 273, + "epoch": 1.3893129770992365, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905221632, + "loss": 1.374, + "grad_norm": 0.7962912321090698, + "learning_rate": 0.0001943146025496555 + }, + { + "step": 274, + "epoch": 1.3944020356234097, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905174016, + "loss": 1.3418, + "grad_norm": 0.6504131555557251, + "learning_rate": 0.00019346277739753855 + }, + { + "step": 275, + "epoch": 1.3994910941475827, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905301504, + "loss": 1.3407, + "grad_norm": 0.7462274432182312, + "learning_rate": 0.00019260941937797776 + }, + { + "step": 276, + "epoch": 1.4045801526717558, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905186304, + "loss": 1.3522, + "grad_norm": 0.5024028420448303, + "learning_rate": 0.00019175455858763988 + }, + { + "step": 277, + "epoch": 1.4096692111959288, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905135616, + "loss": 1.3584, + "grad_norm": 0.7558271884918213, + "learning_rate": 0.0001908982251761921 + }, + { + "step": 278, + "epoch": 1.4147582697201018, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905123328, + "loss": 1.3116, + "grad_norm": 0.7395772933959961, + "learning_rate": 0.00019004044934523871 + }, + { + "step": 279, + "epoch": 1.4198473282442747, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905115648, + "loss": 1.3652, + "grad_norm": 0.6781938672065735, + "learning_rate": 0.00018918126134725616 + }, + { + "step": 280, + "epoch": 1.424936386768448, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905238528, + "loss": 1.3234, + "grad_norm": 0.9510707855224609, + "learning_rate": 0.00018832069148452582 + }, + { + "step": 281, + "epoch": 1.4300254452926209, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905178624, + "loss": 1.3989, + "grad_norm": 1.4387047290802002, + "learning_rate": 0.00018745877010806534 + }, + { + "step": 282, + "epoch": 1.4351145038167938, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905124864, + "loss": 1.3734, + "grad_norm": 1.0874615907669067, + "learning_rate": 0.00018659552761655828 + }, + { + "step": 283, + "epoch": 1.440203562340967, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514176, + "loss": 1.3472, + "grad_norm": 0.7096400856971741, + "learning_rate": 0.00018573099445528204 + }, + { + "step": 284, + "epoch": 1.44529262086514, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905281536, + "loss": 1.3919, + "grad_norm": 0.912343442440033, + "learning_rate": 0.00018486520111503387 + }, + { + "step": 285, + "epoch": 1.450381679389313, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905152512, + "loss": 1.4232, + "grad_norm": 1.491202712059021, + "learning_rate": 0.0001839981781310558 + }, + { + "step": 286, + "epoch": 1.455470737913486, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905147904, + "loss": 1.4216, + "grad_norm": 1.4537569284439087, + "learning_rate": 0.00018312995608195747 + }, + { + "step": 287, + "epoch": 1.460559796437659, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905051136, + "loss": 1.415, + "grad_norm": 1.4356942176818848, + "learning_rate": 0.00018226056558863778 + }, + { + "step": 288, + "epoch": 1.465648854961832, + "cpu_mem": 3.113467904, + "gpu_mem": 1.9051264, + "loss": 1.3901, + "grad_norm": 1.1576313972473145, + "learning_rate": 0.00018139003731320496 + }, + { + "step": 289, + "epoch": 1.470737913486005, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905146368, + "loss": 1.364, + "grad_norm": 0.9789712429046631, + "learning_rate": 0.00018051840195789506 + }, + { + "step": 290, + "epoch": 1.4758269720101782, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905112576, + "loss": 1.3638, + "grad_norm": 0.8423197865486145, + "learning_rate": 0.00017964569026398926 + }, + { + "step": 291, + "epoch": 1.4809160305343512, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905074176, + "loss": 1.444, + "grad_norm": 1.6864820718765259, + "learning_rate": 0.00017877193301072945 + }, + { + "step": 292, + "epoch": 1.4860050890585241, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905200128, + "loss": 1.4175, + "grad_norm": 1.6534279584884644, + "learning_rate": 0.0001778971610142331 + }, + { + "step": 293, + "epoch": 1.491094147582697, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905077248, + "loss": 1.4312, + "grad_norm": 1.6089859008789062, + "learning_rate": 0.00017702140512640594 + }, + { + "step": 294, + "epoch": 1.4961832061068703, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905120256, + "loss": 1.4058, + "grad_norm": 1.1727707386016846, + "learning_rate": 0.00017614469623385414 + }, + { + "step": 295, + "epoch": 1.5012722646310432, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90510336, + "loss": 1.3402, + "grad_norm": 1.3052458763122559, + "learning_rate": 0.00017526706525679498 + }, + { + "step": 296, + "epoch": 1.5063613231552164, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905144832, + "loss": 1.3429, + "grad_norm": 0.8560346961021423, + "learning_rate": 0.00017438854314796623 + }, + { + "step": 297, + "epoch": 1.5114503816793894, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905084928, + "loss": 1.3907, + "grad_norm": 0.9063064455986023, + "learning_rate": 0.00017350916089153455 + }, + { + "step": 298, + "epoch": 1.5165394402035624, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905107968, + "loss": 1.3815, + "grad_norm": 1.1800432205200195, + "learning_rate": 0.00017262894950200277 + }, + { + "step": 299, + "epoch": 1.5216284987277353, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905097216, + "loss": 1.3354, + "grad_norm": 0.7010225653648376, + "learning_rate": 0.000171747940023116 + }, + { + "step": 300, + "epoch": 1.5267175572519083, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905109504, + "loss": 1.3611, + "grad_norm": 0.6866477727890015, + "learning_rate": 0.0001708661635267667 + }, + { + "step": 301, + "epoch": 1.5318066157760815, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905112576, + "loss": 1.3609, + "grad_norm": 1.737938404083252, + "learning_rate": 0.00016998365111189906 + }, + { + "step": 302, + "epoch": 1.5368956743002544, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905094144, + "loss": 1.348, + "grad_norm": 0.6783648133277893, + "learning_rate": 0.00016910043390341183 + }, + { + "step": 303, + "epoch": 1.5419847328244276, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905100288, + "loss": 1.3302, + "grad_norm": 0.9422507286071777, + "learning_rate": 0.0001682165430510609 + }, + { + "step": 304, + "epoch": 1.5470737913486006, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905048064, + "loss": 1.3818, + "grad_norm": 1.8866678476333618, + "learning_rate": 0.00016733200972836055 + }, + { + "step": 305, + "epoch": 1.5521628498727735, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905120256, + "loss": 1.3217, + "grad_norm": 1.0897494554519653, + "learning_rate": 0.00016644686513148397 + }, + { + "step": 306, + "epoch": 1.5572519083969465, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905161728, + "loss": 1.3378, + "grad_norm": 1.3383039236068726, + "learning_rate": 0.00016556114047816317 + }, + { + "step": 307, + "epoch": 1.5623409669211195, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905109504, + "loss": 1.3253, + "grad_norm": 1.0309984683990479, + "learning_rate": 0.00016467486700658785 + }, + { + "step": 308, + "epoch": 1.5674300254452926, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905120256, + "loss": 1.3399, + "grad_norm": 1.3751200437545776, + "learning_rate": 0.0001637880759743037 + }, + { + "step": 309, + "epoch": 1.5725190839694656, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90510336, + "loss": 1.3883, + "grad_norm": 1.8217005729675293, + "learning_rate": 0.00016290079865711004 + }, + { + "step": 310, + "epoch": 1.5776081424936388, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905158656, + "loss": 1.3429, + "grad_norm": 1.2679810523986816, + "learning_rate": 0.00016201306634795675 + }, + { + "step": 311, + "epoch": 1.5826972010178118, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905124864, + "loss": 1.3513, + "grad_norm": 1.1056039333343506, + "learning_rate": 0.00016112491035584047 + }, + { + "step": 312, + "epoch": 1.5877862595419847, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90513408, + "loss": 1.3315, + "grad_norm": 1.150239109992981, + "learning_rate": 0.00016023636200470065 + }, + { + "step": 313, + "epoch": 1.5928753180661577, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905135616, + "loss": 1.331, + "grad_norm": 1.344879150390625, + "learning_rate": 0.00015934745263231464 + }, + { + "step": 314, + "epoch": 1.5979643765903306, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905362944, + "loss": 1.3818, + "grad_norm": 1.4897019863128662, + "learning_rate": 0.00015845821358919236 + }, + { + "step": 315, + "epoch": 1.6030534351145038, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905155584, + "loss": 1.3259, + "grad_norm": 1.104763388633728, + "learning_rate": 0.00015756867623747088 + }, + { + "step": 316, + "epoch": 1.608142493638677, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905127936, + "loss": 1.3535, + "grad_norm": 1.1140691041946411, + "learning_rate": 0.00015667887194980806 + }, + { + "step": 317, + "epoch": 1.61323155216285, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905092608, + "loss": 1.3251, + "grad_norm": 1.0610512495040894, + "learning_rate": 0.00015578883210827626 + }, + { + "step": 318, + "epoch": 1.618320610687023, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905183232, + "loss": 1.3099, + "grad_norm": 1.4653223752975464, + "learning_rate": 0.0001548985881032554 + }, + { + "step": 319, + "epoch": 1.623409669211196, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905107968, + "loss": 1.2891, + "grad_norm": 1.3148369789123535, + "learning_rate": 0.00015400817133232606 + }, + { + "step": 320, + "epoch": 1.6284987277353689, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905155584, + "loss": 1.3176, + "grad_norm": 1.148085355758667, + "learning_rate": 0.00015311761319916184 + }, + { + "step": 321, + "epoch": 1.6335877862595418, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905097216, + "loss": 1.3566, + "grad_norm": 1.430254578590393, + "learning_rate": 0.00015222694511242215 + }, + { + "step": 322, + "epoch": 1.638676844783715, + "cpu_mem": 3.113467904, + "gpu_mem": 1.9053184, + "loss": 1.2988, + "grad_norm": 1.3327760696411133, + "learning_rate": 0.00015133619848464424 + }, + { + "step": 323, + "epoch": 1.6437659033078882, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905200128, + "loss": 1.3427, + "grad_norm": 1.5058979988098145, + "learning_rate": 0.0001504454047311353 + }, + { + "step": 324, + "epoch": 1.6488549618320612, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905092608, + "loss": 1.3643, + "grad_norm": 2.2405126094818115, + "learning_rate": 0.00014955459526886468 + }, + { + "step": 325, + "epoch": 1.6539440203562341, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905132544, + "loss": 1.3761, + "grad_norm": 1.7857478857040405, + "learning_rate": 0.00014866380151535574 + }, + { + "step": 326, + "epoch": 1.659033078880407, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90518016, + "loss": 1.252, + "grad_norm": 1.512559413909912, + "learning_rate": 0.0001477730548875778 + }, + { + "step": 327, + "epoch": 1.66412213740458, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905146368, + "loss": 1.3311, + "grad_norm": 1.5184987783432007, + "learning_rate": 0.0001468823868008382 + }, + { + "step": 328, + "epoch": 1.6692111959287532, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905201664, + "loss": 1.2544, + "grad_norm": 1.3399018049240112, + "learning_rate": 0.000145991828667674 + }, + { + "step": 329, + "epoch": 1.6743002544529262, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905094144, + "loss": 1.4214, + "grad_norm": 1.641420841217041, + "learning_rate": 0.0001451014118967446 + }, + { + "step": 330, + "epoch": 1.6793893129770994, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905178624, + "loss": 1.2914, + "grad_norm": 1.7924326658248901, + "learning_rate": 0.00014421116789172374 + }, + { + "step": 331, + "epoch": 1.6844783715012723, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905167872, + "loss": 1.2282, + "grad_norm": 1.2720330953598022, + "learning_rate": 0.00014332112805019194 + }, + { + "step": 332, + "epoch": 1.6895674300254453, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905123328, + "loss": 1.2894, + "grad_norm": 1.4926481246948242, + "learning_rate": 0.00014243132376252912 + }, + { + "step": 333, + "epoch": 1.6946564885496183, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905212416, + "loss": 1.4835, + "grad_norm": 2.056016683578491, + "learning_rate": 0.00014154178641080767 + }, + { + "step": 334, + "epoch": 1.6997455470737912, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905140224, + "loss": 1.3122, + "grad_norm": 1.4418115615844727, + "learning_rate": 0.0001406525473676854 + }, + { + "step": 335, + "epoch": 1.7048346055979644, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514944, + "loss": 1.2959, + "grad_norm": 1.2824722528457642, + "learning_rate": 0.00013976363799529936 + }, + { + "step": 336, + "epoch": 1.7099236641221374, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905201664, + "loss": 1.3376, + "grad_norm": 1.2171121835708618, + "learning_rate": 0.00013887508964415956 + }, + { + "step": 337, + "epoch": 1.7150127226463106, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905201664, + "loss": 1.247, + "grad_norm": 1.7492483854293823, + "learning_rate": 0.00013798693365204325 + }, + { + "step": 338, + "epoch": 1.7201017811704835, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905238528, + "loss": 1.3447, + "grad_norm": 1.5868966579437256, + "learning_rate": 0.00013709920134288993 + }, + { + "step": 339, + "epoch": 1.7251908396946565, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905155584, + "loss": 1.2842, + "grad_norm": 1.0777534246444702, + "learning_rate": 0.00013621192402569628 + }, + { + "step": 340, + "epoch": 1.7302798982188294, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905224704, + "loss": 1.3316, + "grad_norm": 1.2451850175857544, + "learning_rate": 0.00013532513299341215 + }, + { + "step": 341, + "epoch": 1.7353689567430024, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905166336, + "loss": 1.2912, + "grad_norm": 1.242231845855713, + "learning_rate": 0.00013443885952183683 + }, + { + "step": 342, + "epoch": 1.7404580152671756, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90517248, + "loss": 1.3336, + "grad_norm": 1.7687222957611084, + "learning_rate": 0.00013355313486851603 + }, + { + "step": 343, + "epoch": 1.7455470737913485, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905083392, + "loss": 1.3219, + "grad_norm": 1.2731298208236694, + "learning_rate": 0.00013266799027163942 + }, + { + "step": 344, + "epoch": 1.7506361323155217, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905201664, + "loss": 1.2563, + "grad_norm": 1.2799016237258911, + "learning_rate": 0.00013178345694893906 + }, + { + "step": 345, + "epoch": 1.7557251908396947, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905154048, + "loss": 1.2672, + "grad_norm": 1.2973897457122803, + "learning_rate": 0.0001308995660965881 + }, + { + "step": 346, + "epoch": 1.7608142493638677, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905100288, + "loss": 1.3017, + "grad_norm": 1.6960549354553223, + "learning_rate": 0.00013001634888810094 + }, + { + "step": 347, + "epoch": 1.7659033078880406, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514944, + "loss": 1.3001, + "grad_norm": 1.7435672283172607, + "learning_rate": 0.0001291338364732333 + }, + { + "step": 348, + "epoch": 1.7709923664122136, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905275392, + "loss": 1.3799, + "grad_norm": 1.811391830444336, + "learning_rate": 0.00012825205997688403 + }, + { + "step": 349, + "epoch": 1.7760814249363868, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905074176, + "loss": 1.3102, + "grad_norm": 1.7709952592849731, + "learning_rate": 0.00012737105049799723 + }, + { + "step": 350, + "epoch": 1.78117048346056, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905138688, + "loss": 1.3116, + "grad_norm": 1.71134614944458, + "learning_rate": 0.00012649083910846543 + }, + { + "step": 351, + "epoch": 1.786259541984733, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905154048, + "loss": 1.2823, + "grad_norm": 1.5174767971038818, + "learning_rate": 0.00012561145685203374 + }, + { + "step": 352, + "epoch": 1.7913486005089059, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905144832, + "loss": 1.27, + "grad_norm": 1.6558209657669067, + "learning_rate": 0.00012473293474320505 + }, + { + "step": 353, + "epoch": 1.7964376590330788, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514176, + "loss": 1.2733, + "grad_norm": 2.0085315704345703, + "learning_rate": 0.00012385530376614586 + }, + { + "step": 354, + "epoch": 1.8015267175572518, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905117184, + "loss": 1.2629, + "grad_norm": 2.0381364822387695, + "learning_rate": 0.00012297859487359408 + }, + { + "step": 355, + "epoch": 1.806615776081425, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905100288, + "loss": 1.2346, + "grad_norm": 1.492771029472351, + "learning_rate": 0.0001221028389857669 + }, + { + "step": 356, + "epoch": 1.811704834605598, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905101824, + "loss": 1.2123, + "grad_norm": 1.5164073705673218, + "learning_rate": 0.00012122806698927051 + }, + { + "step": 357, + "epoch": 1.8167938931297711, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905154048, + "loss": 1.2768, + "grad_norm": 2.091362714767456, + "learning_rate": 0.00012035430973601075 + }, + { + "step": 358, + "epoch": 1.821882951653944, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905220096, + "loss": 1.2864, + "grad_norm": 2.366520404815674, + "learning_rate": 0.00011948159804210495 + }, + { + "step": 359, + "epoch": 1.826972010178117, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905174016, + "loss": 1.147, + "grad_norm": 1.6528884172439575, + "learning_rate": 0.00011860996268679504 + }, + { + "step": 360, + "epoch": 1.83206106870229, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905224704, + "loss": 1.3236, + "grad_norm": 2.12330961227417, + "learning_rate": 0.00011773943441136221 + }, + { + "step": 361, + "epoch": 1.837150127226463, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90519552, + "loss": 1.241, + "grad_norm": 1.9378141164779663, + "learning_rate": 0.00011687004391804251 + }, + { + "step": 362, + "epoch": 1.8422391857506362, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905129472, + "loss": 1.4082, + "grad_norm": 2.5934369564056396, + "learning_rate": 0.00011600182186894417 + }, + { + "step": 363, + "epoch": 1.8473282442748091, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905086464, + "loss": 1.2106, + "grad_norm": 1.8395181894302368, + "learning_rate": 0.00011513479888496609 + }, + { + "step": 364, + "epoch": 1.8524173027989823, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905114112, + "loss": 1.2044, + "grad_norm": 2.014338493347168, + "learning_rate": 0.00011426900554471795 + }, + { + "step": 365, + "epoch": 1.8575063613231553, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905097216, + "loss": 1.2812, + "grad_norm": 1.8447554111480713, + "learning_rate": 0.0001134044723834417 + }, + { + "step": 366, + "epoch": 1.8625954198473282, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905140224, + "loss": 1.1987, + "grad_norm": 1.82839035987854, + "learning_rate": 0.00011254122989193465 + }, + { + "step": 367, + "epoch": 1.8676844783715012, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905193984, + "loss": 1.2467, + "grad_norm": 1.8893308639526367, + "learning_rate": 0.00011167930851547418 + }, + { + "step": 368, + "epoch": 1.8727735368956742, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905091072, + "loss": 1.2245, + "grad_norm": 1.7109169960021973, + "learning_rate": 0.0001108187386527438 + }, + { + "step": 369, + "epoch": 1.8778625954198473, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905150976, + "loss": 1.1939, + "grad_norm": 1.7093673944473267, + "learning_rate": 0.00010995955065476126 + }, + { + "step": 370, + "epoch": 1.8829516539440203, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905074176, + "loss": 1.2501, + "grad_norm": 1.9341355562210083, + "learning_rate": 0.00010910177482380795 + }, + { + "step": 371, + "epoch": 1.8880407124681935, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90523392, + "loss": 1.1762, + "grad_norm": 1.599368929862976, + "learning_rate": 0.00010824544141236015 + }, + { + "step": 372, + "epoch": 1.8931297709923665, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905088, + "loss": 1.252, + "grad_norm": 1.9920207262039185, + "learning_rate": 0.00010739058062202224 + }, + { + "step": 373, + "epoch": 1.8982188295165394, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90511104, + "loss": 1.2334, + "grad_norm": 2.071505308151245, + "learning_rate": 0.00010653722260246145 + }, + { + "step": 374, + "epoch": 1.9033078880407124, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90513408, + "loss": 1.3293, + "grad_norm": 2.605424165725708, + "learning_rate": 0.00010568539745034447 + }, + { + "step": 375, + "epoch": 1.9083969465648853, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905358336, + "loss": 1.326, + "grad_norm": 2.2104361057281494, + "learning_rate": 0.00010483513520827614 + }, + { + "step": 376, + "epoch": 1.9134860050890585, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905115648, + "loss": 1.2648, + "grad_norm": 1.7226589918136597, + "learning_rate": 0.00010398646586373969 + }, + { + "step": 377, + "epoch": 1.9185750636132317, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905238528, + "loss": 1.245, + "grad_norm": 1.9684314727783203, + "learning_rate": 0.00010313941934803922 + }, + { + "step": 378, + "epoch": 1.9236641221374047, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905154048, + "loss": 1.3108, + "grad_norm": 2.1255056858062744, + "learning_rate": 0.00010229402553524413 + }, + { + "step": 379, + "epoch": 1.9287531806615776, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905243136, + "loss": 1.3866, + "grad_norm": 2.676853895187378, + "learning_rate": 0.00010145031424113542 + }, + { + "step": 380, + "epoch": 1.9338422391857506, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514176, + "loss": 1.1982, + "grad_norm": 1.822307825088501, + "learning_rate": 0.00010060831522215416 + }, + { + "step": 381, + "epoch": 1.9389312977099236, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90514944, + "loss": 1.3458, + "grad_norm": 1.8360158205032349, + "learning_rate": 9.976805817435207e-05 + }, + { + "step": 382, + "epoch": 1.9440203562340967, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90507264, + "loss": 1.295, + "grad_norm": 1.500222086906433, + "learning_rate": 9.89295727323441e-05 + }, + { + "step": 383, + "epoch": 1.9491094147582697, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905152512, + "loss": 1.2613, + "grad_norm": 1.6363192796707153, + "learning_rate": 9.809288846826327e-05 + }, + { + "step": 384, + "epoch": 1.954198473282443, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905123328, + "loss": 1.3119, + "grad_norm": 1.9601236581802368, + "learning_rate": 9.725803489071779e-05 + }, + { + "step": 385, + "epoch": 1.9592875318066159, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90511104, + "loss": 1.3026, + "grad_norm": 1.7063336372375488, + "learning_rate": 9.642504144375026e-05 + }, + { + "step": 386, + "epoch": 1.9643765903307888, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905253888, + "loss": 1.1781, + "grad_norm": 1.368180751800537, + "learning_rate": 9.559393750579926e-05 + }, + { + "step": 387, + "epoch": 1.9694656488549618, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905137152, + "loss": 1.2942, + "grad_norm": 1.3331520557403564, + "learning_rate": 9.476475238866318e-05 + }, + { + "step": 388, + "epoch": 1.9745547073791347, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905147904, + "loss": 1.2764, + "grad_norm": 1.381353497505188, + "learning_rate": 9.393751533646649e-05 + }, + { + "step": 389, + "epoch": 1.979643765903308, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90530304, + "loss": 1.331, + "grad_norm": 1.504258155822754, + "learning_rate": 9.31122555246283e-05 + }, + { + "step": 390, + "epoch": 1.984732824427481, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905112576, + "loss": 1.2213, + "grad_norm": 1.6263151168823242, + "learning_rate": 9.228900205883324e-05 + }, + { + "step": 391, + "epoch": 1.989821882951654, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90513408, + "loss": 1.1804, + "grad_norm": 1.552638292312622, + "learning_rate": 9.146778397400543e-05 + }, + { + "step": 392, + "epoch": 1.994910941475827, + "cpu_mem": 3.113467904, + "gpu_mem": 1.905170944, + "loss": 1.2642, + "grad_norm": 1.3623449802398682, + "learning_rate": 9.064863023328384e-05 + }, + { + "step": 393, + "epoch": 2.0, + "cpu_mem": 3.113467904, + "gpu_mem": 1.90475776, + "loss": 1.7831, + "grad_norm": 2.773557186126709, + "learning_rate": 8.983156972700125e-05 + }, + { + "step": 394, + "epoch": 2.005089058524173, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821018112, + "loss": 1.0019, + "grad_norm": 1.8360249996185303, + "learning_rate": 8.901663127166513e-05 + }, + { + "step": 395, + "epoch": 2.010178117048346, + "cpu_mem": 3.113467904, + "gpu_mem": 1.820996608, + "loss": 1.1047, + "grad_norm": 2.0697855949401855, + "learning_rate": 8.820384360894143e-05 + }, + { + "step": 396, + "epoch": 2.015267175572519, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821011968, + "loss": 1.1274, + "grad_norm": 2.029513359069824, + "learning_rate": 8.739323540464063e-05 + }, + { + "step": 397, + "epoch": 2.0203562340966923, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821159424, + "loss": 0.9022, + "grad_norm": 1.9198529720306396, + "learning_rate": 8.658483524770728e-05 + }, + { + "step": 398, + "epoch": 2.0254452926208653, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821105664, + "loss": 1.0918, + "grad_norm": 3.0439510345458984, + "learning_rate": 8.577867164921113e-05 + }, + { + "step": 399, + "epoch": 2.030534351145038, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82112256, + "loss": 1.0646, + "grad_norm": 2.7595407962799072, + "learning_rate": 8.497477304134203e-05 + }, + { + "step": 400, + "epoch": 2.035623409669211, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821074944, + "loss": 1.0026, + "grad_norm": 3.4280829429626465, + "learning_rate": 8.41731677764068e-05 + }, + { + "step": 401, + "epoch": 2.040712468193384, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82109952, + "loss": 0.94, + "grad_norm": 3.7877755165100098, + "learning_rate": 8.337388412582972e-05 + }, + { + "step": 402, + "epoch": 2.045801526717557, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821147136, + "loss": 1.158, + "grad_norm": 4.703278064727783, + "learning_rate": 8.257695027915481e-05 + }, + { + "step": 403, + "epoch": 2.05089058524173, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821056512, + "loss": 0.9696, + "grad_norm": 3.19827938079834, + "learning_rate": 8.178239434305235e-05 + }, + { + "step": 404, + "epoch": 2.0559796437659035, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821028864, + "loss": 0.9487, + "grad_norm": 3.562917470932007, + "learning_rate": 8.099024434032717e-05 + }, + { + "step": 405, + "epoch": 2.0610687022900764, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821117952, + "loss": 1.0539, + "grad_norm": 4.523188591003418, + "learning_rate": 8.02005282089303e-05 + }, + { + "step": 406, + "epoch": 2.0661577608142494, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821008896, + "loss": 0.8491, + "grad_norm": 2.4671928882598877, + "learning_rate": 7.941327380097388e-05 + }, + { + "step": 407, + "epoch": 2.0712468193384224, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82106112, + "loss": 0.9503, + "grad_norm": 3.4050047397613525, + "learning_rate": 7.862850888174869e-05 + }, + { + "step": 408, + "epoch": 2.0763358778625953, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82101504, + "loss": 0.9635, + "grad_norm": 3.670715093612671, + "learning_rate": 7.784626112874487e-05 + }, + { + "step": 409, + "epoch": 2.0814249363867683, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821217792, + "loss": 0.9511, + "grad_norm": 2.748667001724243, + "learning_rate": 7.706655813067594e-05 + }, + { + "step": 410, + "epoch": 2.0865139949109412, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821216256, + "loss": 1.018, + "grad_norm": 3.3010270595550537, + "learning_rate": 7.628942738650573e-05 + }, + { + "step": 411, + "epoch": 2.0916030534351147, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821139456, + "loss": 0.904, + "grad_norm": 2.789752960205078, + "learning_rate": 7.551489630447835e-05 + }, + { + "step": 412, + "epoch": 2.0966921119592876, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821096448, + "loss": 1.1889, + "grad_norm": 3.731233596801758, + "learning_rate": 7.474299220115195e-05 + }, + { + "step": 413, + "epoch": 2.1017811704834606, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821073408, + "loss": 0.9122, + "grad_norm": 2.9216806888580322, + "learning_rate": 7.397374230043484e-05 + }, + { + "step": 414, + "epoch": 2.1068702290076335, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821027328, + "loss": 0.8975, + "grad_norm": 2.250838279724121, + "learning_rate": 7.320717373262557e-05 + }, + { + "step": 415, + "epoch": 2.1119592875318065, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821144064, + "loss": 1.0718, + "grad_norm": 2.8770334720611572, + "learning_rate": 7.244331353345625e-05 + }, + { + "step": 416, + "epoch": 2.1170483460559795, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821031936, + "loss": 1.0717, + "grad_norm": 2.722787857055664, + "learning_rate": 7.16821886431386e-05 + }, + { + "step": 417, + "epoch": 2.122137404580153, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821021184, + "loss": 1.014, + "grad_norm": 2.8963892459869385, + "learning_rate": 7.092382590541432e-05 + }, + { + "step": 418, + "epoch": 2.127226463104326, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821025792, + "loss": 1.0238, + "grad_norm": 2.330237627029419, + "learning_rate": 7.016825206660788e-05 + }, + { + "step": 419, + "epoch": 2.132315521628499, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821140992, + "loss": 1.0449, + "grad_norm": 3.2127678394317627, + "learning_rate": 6.941549377468367e-05 + }, + { + "step": 420, + "epoch": 2.1374045801526718, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82111488, + "loss": 0.8869, + "grad_norm": 2.6941819190979004, + "learning_rate": 6.866557757830575e-05 + }, + { + "step": 421, + "epoch": 2.1424936386768447, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821016576, + "loss": 0.848, + "grad_norm": 2.5436348915100098, + "learning_rate": 6.791852992590169e-05 + }, + { + "step": 422, + "epoch": 2.1475826972010177, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821051904, + "loss": 0.9505, + "grad_norm": 2.504446506500244, + "learning_rate": 6.717437716472997e-05 + }, + { + "step": 423, + "epoch": 2.1526717557251906, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821018112, + "loss": 0.9931, + "grad_norm": 2.937140703201294, + "learning_rate": 6.643314553995034e-05 + }, + { + "step": 424, + "epoch": 2.157760814249364, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821151744, + "loss": 0.9331, + "grad_norm": 2.807204008102417, + "learning_rate": 6.569486119369863e-05 + }, + { + "step": 425, + "epoch": 2.162849872773537, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821125632, + "loss": 0.908, + "grad_norm": 3.1015007495880127, + "learning_rate": 6.495955016416441e-05 + }, + { + "step": 426, + "epoch": 2.16793893129771, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821170176, + "loss": 0.7622, + "grad_norm": 2.822793483734131, + "learning_rate": 6.422723838467286e-05 + }, + { + "step": 427, + "epoch": 2.173027989821883, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82106112, + "loss": 1.0248, + "grad_norm": 3.2857625484466553, + "learning_rate": 6.349795168276994e-05 + }, + { + "step": 428, + "epoch": 2.178117048346056, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821078016, + "loss": 0.7292, + "grad_norm": 3.2497096061706543, + "learning_rate": 6.277171577931187e-05 + }, + { + "step": 429, + "epoch": 2.183206106870229, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82115328, + "loss": 0.9894, + "grad_norm": 3.270601749420166, + "learning_rate": 6.204855628755751e-05 + }, + { + "step": 430, + "epoch": 2.188295165394402, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821001216, + "loss": 1.0788, + "grad_norm": 3.960789918899536, + "learning_rate": 6.13284987122654e-05 + }, + { + "step": 431, + "epoch": 2.1933842239185752, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821119488, + "loss": 0.8422, + "grad_norm": 4.07096529006958, + "learning_rate": 6.061156844879417e-05 + }, + { + "step": 432, + "epoch": 2.198473282442748, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821108736, + "loss": 0.7631, + "grad_norm": 3.1884262561798096, + "learning_rate": 5.9897790782206636e-05 + }, + { + "step": 433, + "epoch": 2.203562340966921, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821074944, + "loss": 0.9472, + "grad_norm": 4.213481426239014, + "learning_rate": 5.9187190886378306e-05 + }, + { + "step": 434, + "epoch": 2.208651399491094, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82105344, + "loss": 1.1781, + "grad_norm": 5.448444366455078, + "learning_rate": 5.8479793823109406e-05 + }, + { + "step": 435, + "epoch": 2.213740458015267, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821048832, + "loss": 0.8777, + "grad_norm": 3.9265336990356445, + "learning_rate": 5.777562454124113e-05 + }, + { + "step": 436, + "epoch": 2.21882951653944, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821059584, + "loss": 0.9067, + "grad_norm": 4.149843692779541, + "learning_rate": 5.7074707875775496e-05 + }, + { + "step": 437, + "epoch": 2.223918575063613, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821113344, + "loss": 0.9506, + "grad_norm": 4.431631565093994, + "learning_rate": 5.637706854699974e-05 + }, + { + "step": 438, + "epoch": 2.2290076335877864, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8210304, + "loss": 0.892, + "grad_norm": 3.369805097579956, + "learning_rate": 5.568273115961414e-05 + }, + { + "step": 439, + "epoch": 2.2340966921119594, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821058048, + "loss": 0.8835, + "grad_norm": 3.7866477966308594, + "learning_rate": 5.499172020186447e-05 + }, + { + "step": 440, + "epoch": 2.2391857506361323, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821056512, + "loss": 0.7847, + "grad_norm": 3.2563962936401367, + "learning_rate": 5.430406004467842e-05 + }, + { + "step": 441, + "epoch": 2.2442748091603053, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82109952, + "loss": 0.8076, + "grad_norm": 4.040958881378174, + "learning_rate": 5.361977494080572e-05 + }, + { + "step": 442, + "epoch": 2.2493638676844783, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821097984, + "loss": 1.0575, + "grad_norm": 3.9884541034698486, + "learning_rate": 5.293888902396319e-05 + }, + { + "step": 443, + "epoch": 2.2544529262086512, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821148672, + "loss": 0.9085, + "grad_norm": 4.26292085647583, + "learning_rate": 5.2261426307983204e-05 + }, + { + "step": 444, + "epoch": 2.2595419847328246, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821154816, + "loss": 0.8856, + "grad_norm": 3.75675892829895, + "learning_rate": 5.158741068596714e-05 + }, + { + "step": 445, + "epoch": 2.2646310432569976, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821093376, + "loss": 0.955, + "grad_norm": 3.913606882095337, + "learning_rate": 5.0916865929442326e-05 + }, + { + "step": 446, + "epoch": 2.2697201017811706, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821082624, + "loss": 0.9614, + "grad_norm": 3.6866273880004883, + "learning_rate": 5.024981568752386e-05 + }, + { + "step": 447, + "epoch": 2.2748091603053435, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8211072, + "loss": 0.9116, + "grad_norm": 3.5516369342803955, + "learning_rate": 4.958628348608065e-05 + }, + { + "step": 448, + "epoch": 2.2798982188295165, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821124096, + "loss": 0.8063, + "grad_norm": 3.7882702350616455, + "learning_rate": 4.892629272690536e-05 + }, + { + "step": 449, + "epoch": 2.2849872773536894, + "cpu_mem": 3.113467904, + "gpu_mem": 1.820995072, + "loss": 0.9265, + "grad_norm": 3.706578016281128, + "learning_rate": 4.826986668688944e-05 + }, + { + "step": 450, + "epoch": 2.2900763358778624, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821170176, + "loss": 0.8024, + "grad_norm": 3.7211546897888184, + "learning_rate": 4.761702851720191e-05 + }, + { + "step": 451, + "epoch": 2.2951653944020354, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821127168, + "loss": 0.7846, + "grad_norm": 3.3788211345672607, + "learning_rate": 4.6967801242472916e-05 + }, + { + "step": 452, + "epoch": 2.300254452926209, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821036544, + "loss": 0.7157, + "grad_norm": 3.0570080280303955, + "learning_rate": 4.632220775998172e-05 + }, + { + "step": 453, + "epoch": 2.3053435114503817, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82098432, + "loss": 0.8621, + "grad_norm": 3.600937604904175, + "learning_rate": 4.568027083884929e-05 + }, + { + "step": 454, + "epoch": 2.3104325699745547, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82099968, + "loss": 0.9429, + "grad_norm": 3.9426963329315186, + "learning_rate": 4.504201311923488e-05 + }, + { + "step": 455, + "epoch": 2.3155216284987277, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821101056, + "loss": 0.9405, + "grad_norm": 4.274807453155518, + "learning_rate": 4.440745711153804e-05 + }, + { + "step": 456, + "epoch": 2.3206106870229006, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821110272, + "loss": 0.8662, + "grad_norm": 4.326922416687012, + "learning_rate": 4.377662519560423e-05 + }, + { + "step": 457, + "epoch": 2.325699745547074, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821139456, + "loss": 1.0695, + "grad_norm": 4.525497913360596, + "learning_rate": 4.3149539619935836e-05 + }, + { + "step": 458, + "epoch": 2.330788804071247, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821200896, + "loss": 0.961, + "grad_norm": 4.260465621948242, + "learning_rate": 4.252622250090746e-05 + }, + { + "step": 459, + "epoch": 2.33587786259542, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821054976, + "loss": 0.9071, + "grad_norm": 4.009454250335693, + "learning_rate": 4.190669582198571e-05 + }, + { + "step": 460, + "epoch": 2.340966921119593, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821142528, + "loss": 0.9244, + "grad_norm": 4.703610897064209, + "learning_rate": 4.1290981432954185e-05 + }, + { + "step": 461, + "epoch": 2.346055979643766, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 1.0102, + "grad_norm": 4.305217742919922, + "learning_rate": 4.067910104914249e-05 + }, + { + "step": 462, + "epoch": 2.351145038167939, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821116416, + "loss": 0.9591, + "grad_norm": 4.934427738189697, + "learning_rate": 4.007107625066079e-05 + }, + { + "step": 463, + "epoch": 2.356234096692112, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821151744, + "loss": 0.7848, + "grad_norm": 3.6162827014923096, + "learning_rate": 3.946692848163836e-05 + }, + { + "step": 464, + "epoch": 2.3613231552162848, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821090304, + "loss": 0.7923, + "grad_norm": 3.4046671390533447, + "learning_rate": 3.886667904946739e-05 + }, + { + "step": 465, + "epoch": 2.366412213740458, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821047296, + "loss": 0.9039, + "grad_norm": 3.8736793994903564, + "learning_rate": 3.8270349124051694e-05 + }, + { + "step": 466, + "epoch": 2.371501272264631, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821048832, + "loss": 0.8813, + "grad_norm": 3.2090065479278564, + "learning_rate": 3.767795973705975e-05 + }, + { + "step": 467, + "epoch": 2.376590330788804, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821085696, + "loss": 0.6856, + "grad_norm": 3.355933666229248, + "learning_rate": 3.708953178118324e-05 + }, + { + "step": 468, + "epoch": 2.381679389312977, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821058048, + "loss": 1.0183, + "grad_norm": 3.8839523792266846, + "learning_rate": 3.6505086009399944e-05 + }, + { + "step": 469, + "epoch": 2.38676844783715, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82111488, + "loss": 0.7343, + "grad_norm": 3.3767969608306885, + "learning_rate": 3.5924643034242136e-05 + }, + { + "step": 470, + "epoch": 2.391857506361323, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821144064, + "loss": 0.8597, + "grad_norm": 4.108831405639648, + "learning_rate": 3.5348223327069105e-05 + }, + { + "step": 471, + "epoch": 2.3969465648854964, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821044224, + "loss": 0.8768, + "grad_norm": 3.6018404960632324, + "learning_rate": 3.4775847217345756e-05 + }, + { + "step": 472, + "epoch": 2.4020356234096694, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821067264, + "loss": 0.7739, + "grad_norm": 3.7794270515441895, + "learning_rate": 3.420753489192524e-05 + }, + { + "step": 473, + "epoch": 2.4071246819338423, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821042688, + "loss": 0.8448, + "grad_norm": 3.6236464977264404, + "learning_rate": 3.364330639433701e-05 + }, + { + "step": 474, + "epoch": 2.4122137404580153, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821134848, + "loss": 0.9312, + "grad_norm": 3.266313314437866, + "learning_rate": 3.308318162408013e-05 + }, + { + "step": 475, + "epoch": 2.4173027989821882, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821004288, + "loss": 0.8591, + "grad_norm": 3.4973514080047607, + "learning_rate": 3.2527180335921186e-05 + }, + { + "step": 476, + "epoch": 2.422391857506361, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821140992, + "loss": 0.7782, + "grad_norm": 4.105288028717041, + "learning_rate": 3.197532213919774e-05 + }, + { + "step": 477, + "epoch": 2.427480916030534, + "cpu_mem": 3.113467904, + "gpu_mem": 1.820998144, + "loss": 0.7574, + "grad_norm": 3.1518139839172363, + "learning_rate": 3.1427626497126654e-05 + }, + { + "step": 478, + "epoch": 2.432569974554707, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821021184, + "loss": 0.9103, + "grad_norm": 4.622748374938965, + "learning_rate": 3.088411272611781e-05 + }, + { + "step": 479, + "epoch": 2.4376590330788805, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8210688, + "loss": 1.053, + "grad_norm": 5.19940710067749, + "learning_rate": 3.0344799995092533e-05 + }, + { + "step": 480, + "epoch": 2.4427480916030535, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821111808, + "loss": 0.9879, + "grad_norm": 4.480295658111572, + "learning_rate": 2.9809707324807912e-05 + }, + { + "step": 481, + "epoch": 2.4478371501272265, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821039616, + "loss": 1.007, + "grad_norm": 5.339071750640869, + "learning_rate": 2.9278853587185658e-05 + }, + { + "step": 482, + "epoch": 2.4529262086513994, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82113792, + "loss": 0.9014, + "grad_norm": 4.878562927246094, + "learning_rate": 2.8752257504646616e-05 + }, + { + "step": 483, + "epoch": 2.4580152671755724, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821033472, + "loss": 1.0006, + "grad_norm": 4.631145477294922, + "learning_rate": 2.8229937649450613e-05 + }, + { + "step": 484, + "epoch": 2.4631043256997454, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821188608, + "loss": 0.7264, + "grad_norm": 4.030819416046143, + "learning_rate": 2.7711912443041123e-05 + }, + { + "step": 485, + "epoch": 2.4681933842239188, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821021184, + "loss": 0.8546, + "grad_norm": 3.557971954345703, + "learning_rate": 2.719820015539596e-05 + }, + { + "step": 486, + "epoch": 2.4732824427480917, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821064192, + "loss": 0.8782, + "grad_norm": 4.107112884521484, + "learning_rate": 2.6688818904382513e-05 + }, + { + "step": 487, + "epoch": 2.4783715012722647, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821085696, + "loss": 0.9161, + "grad_norm": 4.056766986846924, + "learning_rate": 2.6183786655119144e-05 + }, + { + "step": 488, + "epoch": 2.4834605597964376, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821008896, + "loss": 0.9338, + "grad_norm": 4.186944961547852, + "learning_rate": 2.5683121219341217e-05 + }, + { + "step": 489, + "epoch": 2.4885496183206106, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8211456, + "loss": 0.749, + "grad_norm": 4.4343791007995605, + "learning_rate": 2.518684025477319e-05 + }, + { + "step": 490, + "epoch": 2.4936386768447836, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821033472, + "loss": 0.7911, + "grad_norm": 3.834761619567871, + "learning_rate": 2.469496126450578e-05 + }, + { + "step": 491, + "epoch": 2.4987277353689565, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8210304, + "loss": 0.9685, + "grad_norm": 4.905758380889893, + "learning_rate": 2.4207501596378508e-05 + }, + { + "step": 492, + "epoch": 2.5038167938931295, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821004288, + "loss": 1.0171, + "grad_norm": 4.31558084487915, + "learning_rate": 2.3724478442368133e-05 + }, + { + "step": 493, + "epoch": 2.508905852417303, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821039616, + "loss": 0.7762, + "grad_norm": 4.475506782531738, + "learning_rate": 2.324590883798204e-05 + }, + { + "step": 494, + "epoch": 2.513994910941476, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821054976, + "loss": 0.6939, + "grad_norm": 3.590113878250122, + "learning_rate": 2.2771809661657614e-05 + }, + { + "step": 495, + "epoch": 2.519083969465649, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821048832, + "loss": 0.7723, + "grad_norm": 3.6632065773010254, + "learning_rate": 2.2302197634166835e-05 + }, + { + "step": 496, + "epoch": 2.524173027989822, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82106112, + "loss": 0.8986, + "grad_norm": 4.348067283630371, + "learning_rate": 2.1837089318026714e-05 + }, + { + "step": 497, + "epoch": 2.5292620865139948, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821093376, + "loss": 0.8345, + "grad_norm": 4.31683874130249, + "learning_rate": 2.1376501116915047e-05 + }, + { + "step": 498, + "epoch": 2.534351145038168, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821094912, + "loss": 0.8759, + "grad_norm": 4.218949794769287, + "learning_rate": 2.0920449275091837e-05 + }, + { + "step": 499, + "epoch": 2.539440203562341, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821067264, + "loss": 0.8139, + "grad_norm": 3.6294329166412354, + "learning_rate": 2.0468949876826573e-05 + }, + { + "step": 500, + "epoch": 2.544529262086514, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821116416, + "loss": 0.9677, + "grad_norm": 4.29981803894043, + "learning_rate": 2.002201884583065e-05 + }, + { + "step": 501, + "epoch": 2.549618320610687, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 0.8024, + "grad_norm": 3.977585554122925, + "learning_rate": 1.957967194469615e-05 + }, + { + "step": 502, + "epoch": 2.55470737913486, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821104128, + "loss": 0.7003, + "grad_norm": 3.6818857192993164, + "learning_rate": 1.9141924774339566e-05 + }, + { + "step": 503, + "epoch": 2.559796437659033, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821094912, + "loss": 0.7005, + "grad_norm": 3.289773941040039, + "learning_rate": 1.8708792773451874e-05 + }, + { + "step": 504, + "epoch": 2.564885496183206, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821028864, + "loss": 0.8487, + "grad_norm": 4.38092565536499, + "learning_rate": 1.828029121795375e-05 + }, + { + "step": 505, + "epoch": 2.569974554707379, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821050368, + "loss": 0.7997, + "grad_norm": 3.9405319690704346, + "learning_rate": 1.7856435220457092e-05 + }, + { + "step": 506, + "epoch": 2.5750636132315523, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821105664, + "loss": 0.9284, + "grad_norm": 4.499026775360107, + "learning_rate": 1.7437239729731806e-05 + }, + { + "step": 507, + "epoch": 2.5801526717557253, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82102272, + "loss": 0.8707, + "grad_norm": 4.25786828994751, + "learning_rate": 1.7022719530178624e-05 + }, + { + "step": 508, + "epoch": 2.5852417302798982, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821087232, + "loss": 0.9989, + "grad_norm": 4.702800750732422, + "learning_rate": 1.6612889241307836e-05 + }, + { + "step": 509, + "epoch": 2.590330788804071, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 0.8028, + "grad_norm": 4.400262832641602, + "learning_rate": 1.620776331722347e-05 + }, + { + "step": 510, + "epoch": 2.595419847328244, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821078016, + "loss": 0.8253, + "grad_norm": 3.974031448364258, + "learning_rate": 1.580735604611368e-05 + }, + { + "step": 511, + "epoch": 2.6005089058524176, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821051904, + "loss": 0.8805, + "grad_norm": 3.995480537414551, + "learning_rate": 1.5411681549746678e-05 + }, + { + "step": 512, + "epoch": 2.6055979643765905, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 0.9104, + "grad_norm": 4.524916172027588, + "learning_rate": 1.502075378297285e-05 + }, + { + "step": 513, + "epoch": 2.6106870229007635, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821056512, + "loss": 0.9692, + "grad_norm": 5.014407634735107, + "learning_rate": 1.4634586533232428e-05 + }, + { + "step": 514, + "epoch": 2.6157760814249365, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82096128, + "loss": 0.8798, + "grad_norm": 4.219020366668701, + "learning_rate": 1.4253193420069292e-05 + }, + { + "step": 515, + "epoch": 2.6208651399491094, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8210304, + "loss": 0.6301, + "grad_norm": 3.3962159156799316, + "learning_rate": 1.3876587894650686e-05 + }, + { + "step": 516, + "epoch": 2.6259541984732824, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821078016, + "loss": 0.8772, + "grad_norm": 4.472508907318115, + "learning_rate": 1.350478323929271e-05 + }, + { + "step": 517, + "epoch": 2.6310432569974553, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82102272, + "loss": 0.9567, + "grad_norm": 5.244596004486084, + "learning_rate": 1.3137792566992001e-05 + }, + { + "step": 518, + "epoch": 2.6361323155216283, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821081088, + "loss": 1.0084, + "grad_norm": 5.080158710479736, + "learning_rate": 1.2775628820963091e-05 + }, + { + "step": 519, + "epoch": 2.6412213740458013, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821087232, + "loss": 0.892, + "grad_norm": 4.244832515716553, + "learning_rate": 1.2418304774182075e-05 + }, + { + "step": 520, + "epoch": 2.6463104325699747, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821223936, + "loss": 0.6988, + "grad_norm": 3.654989242553711, + "learning_rate": 1.2065833028935968e-05 + }, + { + "step": 521, + "epoch": 2.6513994910941476, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821033472, + "loss": 0.782, + "grad_norm": 4.028866291046143, + "learning_rate": 1.1718226016378507e-05 + }, + { + "step": 522, + "epoch": 2.6564885496183206, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821097984, + "loss": 0.8227, + "grad_norm": 3.871262550354004, + "learning_rate": 1.137549599609136e-05 + }, + { + "step": 523, + "epoch": 2.6615776081424936, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821013504, + "loss": 0.8025, + "grad_norm": 3.701946496963501, + "learning_rate": 1.103765505565205e-05 + }, + { + "step": 524, + "epoch": 2.6666666666666665, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8210304, + "loss": 0.9476, + "grad_norm": 4.908421516418457, + "learning_rate": 1.0704715110207579e-05 + }, + { + "step": 525, + "epoch": 2.67175572519084, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8210304, + "loss": 0.8021, + "grad_norm": 3.9580726623535156, + "learning_rate": 1.0376687902053981e-05 + }, + { + "step": 526, + "epoch": 2.676844783715013, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821156352, + "loss": 0.9794, + "grad_norm": 4.487414360046387, + "learning_rate": 1.0053585000222524e-05 + }, + { + "step": 527, + "epoch": 2.681933842239186, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821036544, + "loss": 0.6439, + "grad_norm": 3.482146739959717, + "learning_rate": 9.735417800071433e-06 + }, + { + "step": 528, + "epoch": 2.687022900763359, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821194752, + "loss": 0.8498, + "grad_norm": 4.241966247558594, + "learning_rate": 9.42219752288414e-06 + }, + { + "step": 529, + "epoch": 2.6921119592875318, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821019648, + "loss": 0.8719, + "grad_norm": 4.349477767944336, + "learning_rate": 9.113935215473428e-06 + }, + { + "step": 530, + "epoch": 2.6972010178117047, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82103808, + "loss": 1.0438, + "grad_norm": 4.631407737731934, + "learning_rate": 8.810641749791902e-06 + }, + { + "step": 531, + "epoch": 2.7022900763358777, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82121472, + "loss": 0.7903, + "grad_norm": 3.8820600509643555, + "learning_rate": 8.512327822548481e-06 + }, + { + "step": 532, + "epoch": 2.7073791348600507, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821088768, + "loss": 0.8204, + "grad_norm": 4.3631815910339355, + "learning_rate": 8.219003954831199e-06 + }, + { + "step": 533, + "epoch": 2.712468193384224, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821085696, + "loss": 0.8806, + "grad_norm": 4.463318824768066, + "learning_rate": 7.930680491736135e-06 + }, + { + "step": 534, + "epoch": 2.717557251908397, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821094912, + "loss": 0.8214, + "grad_norm": 4.392256736755371, + "learning_rate": 7.647367602002491e-06 + }, + { + "step": 535, + "epoch": 2.72264631043257, + "cpu_mem": 3.113467904, + "gpu_mem": 1.820970496, + "loss": 0.831, + "grad_norm": 3.8954215049743652, + "learning_rate": 7.369075277654091e-06 + }, + { + "step": 536, + "epoch": 2.727735368956743, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82125312, + "loss": 0.8939, + "grad_norm": 4.657924652099609, + "learning_rate": 7.095813333646832e-06 + }, + { + "step": 537, + "epoch": 2.732824427480916, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821016576, + "loss": 0.9631, + "grad_norm": 5.165740966796875, + "learning_rate": 6.827591407522548e-06 + }, + { + "step": 538, + "epoch": 2.7379134860050893, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821111808, + "loss": 0.9548, + "grad_norm": 4.016014099121094, + "learning_rate": 6.564418959069273e-06 + }, + { + "step": 539, + "epoch": 2.7430025445292623, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821078016, + "loss": 0.8763, + "grad_norm": 4.610321998596191, + "learning_rate": 6.3063052699873326e-06 + }, + { + "step": 540, + "epoch": 2.7480916030534353, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 0.8742, + "grad_norm": 4.520374774932861, + "learning_rate": 6.053259443562286e-06 + }, + { + "step": 541, + "epoch": 2.753180661577608, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821064192, + "loss": 0.9277, + "grad_norm": 4.043761730194092, + "learning_rate": 5.8052904043435985e-06 + }, + { + "step": 542, + "epoch": 2.758269720101781, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82111488, + "loss": 0.7987, + "grad_norm": 4.227725505828857, + "learning_rate": 5.56240689783013e-06 + }, + { + "step": 543, + "epoch": 2.763358778625954, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821010432, + "loss": 0.9619, + "grad_norm": 4.8975982666015625, + "learning_rate": 5.324617490161409e-06 + }, + { + "step": 544, + "epoch": 2.768447837150127, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821039616, + "loss": 0.7307, + "grad_norm": 4.403115749359131, + "learning_rate": 5.091930567815866e-06 + }, + { + "step": 545, + "epoch": 2.7735368956743, + "cpu_mem": 3.113467904, + "gpu_mem": 1.820982784, + "loss": 0.9319, + "grad_norm": 5.2207255363464355, + "learning_rate": 4.86435433731473e-06 + }, + { + "step": 546, + "epoch": 2.778625954198473, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821058048, + "loss": 0.6408, + "grad_norm": 3.4910476207733154, + "learning_rate": 4.641896824932861e-06 + }, + { + "step": 547, + "epoch": 2.7837150127226464, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821050368, + "loss": 0.7218, + "grad_norm": 3.3683738708496094, + "learning_rate": 4.424565876415415e-06 + }, + { + "step": 548, + "epoch": 2.7888040712468194, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821081088, + "loss": 0.7472, + "grad_norm": 3.7214415073394775, + "learning_rate": 4.212369156701373e-06 + }, + { + "step": 549, + "epoch": 2.7938931297709924, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821010432, + "loss": 1.0674, + "grad_norm": 4.790102481842041, + "learning_rate": 4.005314149653133e-06 + }, + { + "step": 550, + "epoch": 2.7989821882951653, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821036544, + "loss": 0.7237, + "grad_norm": 5.058776378631592, + "learning_rate": 3.8034081577924147e-06 + }, + { + "step": 551, + "epoch": 2.8040712468193383, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821131776, + "loss": 0.7898, + "grad_norm": 3.6861226558685303, + "learning_rate": 3.6066583020429864e-06 + }, + { + "step": 552, + "epoch": 2.8091603053435117, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821027328, + "loss": 0.7088, + "grad_norm": 3.6794393062591553, + "learning_rate": 3.415071521479246e-06 + }, + { + "step": 553, + "epoch": 2.8142493638676847, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821036544, + "loss": 0.859, + "grad_norm": 4.3652753829956055, + "learning_rate": 3.2286545730817183e-06 + }, + { + "step": 554, + "epoch": 2.8193384223918576, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821064192, + "loss": 1.0371, + "grad_norm": 4.943362712860107, + "learning_rate": 3.0474140314985628e-06 + }, + { + "step": 555, + "epoch": 2.8244274809160306, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821078016, + "loss": 0.8779, + "grad_norm": 4.743992328643799, + "learning_rate": 2.8713562888138754e-06 + }, + { + "step": 556, + "epoch": 2.8295165394402035, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821071872, + "loss": 0.9108, + "grad_norm": 3.955052375793457, + "learning_rate": 2.7004875543220506e-06 + }, + { + "step": 557, + "epoch": 2.8346055979643765, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821035008, + "loss": 0.7444, + "grad_norm": 3.734229803085327, + "learning_rate": 2.5348138543089425e-06 + }, + { + "step": 558, + "epoch": 2.8396946564885495, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821093376, + "loss": 0.9916, + "grad_norm": 5.415971279144287, + "learning_rate": 2.374341031839283e-06 + }, + { + "step": 559, + "epoch": 2.8447837150127224, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821002752, + "loss": 0.9103, + "grad_norm": 4.247875690460205, + "learning_rate": 2.2190747465505644e-06 + }, + { + "step": 560, + "epoch": 2.849872773536896, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821087232, + "loss": 0.9209, + "grad_norm": 4.942544460296631, + "learning_rate": 2.0690204744534976e-06 + }, + { + "step": 561, + "epoch": 2.854961832061069, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821125632, + "loss": 0.726, + "grad_norm": 4.058422088623047, + "learning_rate": 1.924183507738819e-06 + }, + { + "step": 562, + "epoch": 2.8600508905852418, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821170176, + "loss": 0.9525, + "grad_norm": 4.068112373352051, + "learning_rate": 1.7845689545906704e-06 + }, + { + "step": 563, + "epoch": 2.8651399491094147, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821131776, + "loss": 0.8714, + "grad_norm": 4.035792350769043, + "learning_rate": 1.6501817390064786e-06 + }, + { + "step": 564, + "epoch": 2.8702290076335877, + "cpu_mem": 3.113467904, + "gpu_mem": 1.8210688, + "loss": 0.9153, + "grad_norm": 4.86890983581543, + "learning_rate": 1.521026600623243e-06 + }, + { + "step": 565, + "epoch": 2.875318066157761, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82112256, + "loss": 0.8778, + "grad_norm": 4.439229965209961, + "learning_rate": 1.3971080945503866e-06 + }, + { + "step": 566, + "epoch": 2.880407124681934, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82106112, + "loss": 0.7461, + "grad_norm": 4.225937843322754, + "learning_rate": 1.2784305912090842e-06 + }, + { + "step": 567, + "epoch": 2.885496183206107, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821047296, + "loss": 0.7245, + "grad_norm": 3.7079238891601562, + "learning_rate": 1.1649982761782195e-06 + }, + { + "step": 568, + "epoch": 2.89058524173028, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821259264, + "loss": 0.5193, + "grad_norm": 3.163409471511841, + "learning_rate": 1.0568151500465693e-06 + }, + { + "step": 569, + "epoch": 2.895674300254453, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821124096, + "loss": 0.9202, + "grad_norm": 4.291584014892578, + "learning_rate": 9.538850282719833e-07 + }, + { + "step": 570, + "epoch": 2.900763358778626, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821097984, + "loss": 0.7801, + "grad_norm": 3.8574540615081787, + "learning_rate": 8.56211541046542e-07 + }, + { + "step": 571, + "epoch": 2.905852417302799, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821104128, + "loss": 0.8513, + "grad_norm": 4.3767547607421875, + "learning_rate": 7.637981331687582e-07 + }, + { + "step": 572, + "epoch": 2.910941475826972, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82113792, + "loss": 0.9779, + "grad_norm": 5.144960880279541, + "learning_rate": 6.766480639218752e-07 + }, + { + "step": 573, + "epoch": 2.916030534351145, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 0.9818, + "grad_norm": 4.492354869842529, + "learning_rate": 5.947644069591084e-07 + }, + { + "step": 574, + "epoch": 2.921119592875318, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82113024, + "loss": 0.7505, + "grad_norm": 4.2612080574035645, + "learning_rate": 5.181500501950986e-07 + }, + { + "step": 575, + "epoch": 2.926208651399491, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821018112, + "loss": 0.7699, + "grad_norm": 3.7540624141693115, + "learning_rate": 4.468076957041433e-07 + }, + { + "step": 576, + "epoch": 2.931297709923664, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821081088, + "loss": 0.8675, + "grad_norm": 3.9362919330596924, + "learning_rate": 3.807398596248401e-07 + }, + { + "step": 577, + "epoch": 2.936386768447837, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 0.8348, + "grad_norm": 3.985013723373413, + "learning_rate": 3.199488720714072e-07 + }, + { + "step": 578, + "epoch": 2.94147582697201, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82104576, + "loss": 0.7712, + "grad_norm": 4.082873821258545, + "learning_rate": 2.64436877051466e-07 + }, + { + "step": 579, + "epoch": 2.9465648854961835, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82113024, + "loss": 0.9147, + "grad_norm": 3.601395606994629, + "learning_rate": 2.1420583239040167e-07 + }, + { + "step": 580, + "epoch": 2.9516539440203564, + "cpu_mem": 3.113467904, + "gpu_mem": 1.82107648, + "loss": 0.8939, + "grad_norm": 3.9137625694274902, + "learning_rate": 1.6925750966238494e-07 + }, + { + "step": 581, + "epoch": 2.9567430025445294, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821096448, + "loss": 0.932, + "grad_norm": 4.429101943969727, + "learning_rate": 1.295934941278387e-07 + }, + { + "step": 582, + "epoch": 2.9618320610687023, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821024256, + "loss": 0.8618, + "grad_norm": 4.022157669067383, + "learning_rate": 9.52151846775162e-08 + }, + { + "step": 583, + "epoch": 2.9669211195928753, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821111808, + "loss": 0.8188, + "grad_norm": 4.057188510894775, + "learning_rate": 6.612379378320709e-08 + }, + { + "step": 584, + "epoch": 2.9720101781170483, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821079552, + "loss": 0.8046, + "grad_norm": 4.377601146697998, + "learning_rate": 4.232034745495494e-08 + }, + { + "step": 585, + "epoch": 2.9770992366412212, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821058048, + "loss": 0.7033, + "grad_norm": 3.728914737701416, + "learning_rate": 2.3805685204869583e-08 + }, + { + "step": 586, + "epoch": 2.982188295165394, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821001216, + "loss": 0.8078, + "grad_norm": 4.040831565856934, + "learning_rate": 1.0580460017517444e-08 + }, + { + "step": 587, + "epoch": 2.9872773536895676, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821067264, + "loss": 0.9107, + "grad_norm": 4.278015613555908, + "learning_rate": 2.645138326906604e-09 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821108736, + "loss": 0.9742, + "grad_norm": 4.842123031616211, + "learning_rate": 0.0 + }, + { + "step": 588, + "epoch": 2.9923664122137406, + "cpu_mem": 3.113467904, + "gpu_mem": 1.821108736, + "train_runtime": 8645.871, + "train_samples_per_second": 4.361, + "train_steps_per_second": 0.068, + "total_flos": 0.0, + "train_loss": 1.232399859193231 + } +] \ No newline at end of file